summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/fc.c2
-rw-r--r--net/802/fddi.c12
-rw-r--r--net/802/hippi.c2
-rw-r--r--net/802/tr.c2
-rw-r--r--net/8021q/vlan.c93
-rw-r--r--net/8021q/vlan.h17
-rw-r--r--net/8021q/vlan_core.c117
-rw-r--r--net/8021q/vlan_dev.c13
-rw-r--r--net/9p/client.c62
-rw-r--r--net/9p/trans_fd.c2
-rw-r--r--net/9p/trans_rdma.c29
-rw-r--r--net/9p/trans_virtio.c3
-rw-r--r--net/Kconfig3
-rw-r--r--net/Makefile1
-rw-r--r--net/atm/br2684.c12
-rw-r--r--net/atm/clip.c4
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/lec.c1
-rw-r--r--net/atm/mpc.c2
-rw-r--r--net/atm/proc.c1
-rw-r--r--net/ax25/af_ax25.c2
-rw-r--r--net/ax25/ax25_ds_timer.c2
-rw-r--r--net/ax25/ax25_route.c4
-rw-r--r--net/bluetooth/af_bluetooth.c114
-rw-r--r--net/bluetooth/cmtp/core.c6
-rw-r--r--net/bluetooth/hci_core.c1
-rw-r--r--net/bluetooth/hci_sysfs.c21
-rw-r--r--net/bluetooth/hidp/core.c8
-rw-r--r--net/bluetooth/l2cap.c133
-rw-r--r--net/bluetooth/lib.c4
-rw-r--r--net/bluetooth/rfcomm/core.c43
-rw-r--r--net/bluetooth/rfcomm/sock.c108
-rw-r--r--net/bluetooth/rfcomm/tty.c8
-rw-r--r--net/bridge/br_device.c8
-rw-r--r--net/bridge/br_if.c29
-rw-r--r--net/bridge/br_input.c4
-rw-r--r--net/bridge/br_netfilter.c138
-rw-r--r--net/bridge/netfilter/ebt_vlan.c25
-rw-r--r--net/bridge/netfilter/ebtables.c15
-rw-r--r--net/caif/caif_dev.c24
-rw-r--r--net/caif/caif_socket.c44
-rw-r--r--net/caif/cfcnfg.c49
-rw-r--r--net/caif/cfctrl.c59
-rw-r--r--net/caif/cfdbgl.c4
-rw-r--r--net/caif/cfdgml.c11
-rw-r--r--net/caif/cffrml.c14
-rw-r--r--net/caif/cfmuxl.c14
-rw-r--r--net/caif/cfpkt_skbuff.c50
-rw-r--r--net/caif/cfrfml.c14
-rw-r--r--net/caif/cfserl.c4
-rw-r--r--net/caif/cfsrvl.c17
-rw-r--r--net/caif/cfutill.c12
-rw-r--r--net/caif/cfveil.c11
-rw-r--r--net/caif/cfvidl.c6
-rw-r--r--net/caif/chnl_net.c47
-rw-r--r--net/can/bcm.c41
-rw-r--r--net/can/raw.c37
-rw-r--r--net/ceph/Kconfig28
-rw-r--r--net/ceph/Makefile37
-rw-r--r--net/ceph/armor.c103
-rw-r--r--net/ceph/auth.c259
-rw-r--r--net/ceph/auth_none.c132
-rw-r--r--net/ceph/auth_none.h29
-rw-r--r--net/ceph/auth_x.c688
-rw-r--r--net/ceph/auth_x.h50
-rw-r--r--net/ceph/auth_x_protocol.h90
-rw-r--r--net/ceph/buffer.c68
-rw-r--r--net/ceph/ceph_common.c529
-rw-r--r--net/ceph/ceph_fs.c75
-rw-r--r--net/ceph/ceph_hash.c118
-rw-r--r--net/ceph/ceph_strings.c84
-rw-r--r--net/ceph/crush/crush.c151
-rw-r--r--net/ceph/crush/hash.c149
-rw-r--r--net/ceph/crush/mapper.c609
-rw-r--r--net/ceph/crypto.c412
-rw-r--r--net/ceph/crypto.h48
-rw-r--r--net/ceph/debugfs.c267
-rw-r--r--net/ceph/messenger.c2453
-rw-r--r--net/ceph/mon_client.c1027
-rw-r--r--net/ceph/msgpool.c64
-rw-r--r--net/ceph/osd_client.c1773
-rw-r--r--net/ceph/osdmap.c1128
-rw-r--r--net/ceph/pagelist.c154
-rw-r--r--net/ceph/pagevec.c223
-rw-r--r--net/core/datagram.c6
-rw-r--r--net/core/dev.c625
-rw-r--r--net/core/dst.c39
-rw-r--r--net/core/ethtool.c97
-rw-r--r--net/core/fib_rules.c16
-rw-r--r--net/core/filter.c10
-rw-r--r--net/core/flow.c82
-rw-r--r--net/core/gen_estimator.c16
-rw-r--r--net/core/iovec.c11
-rw-r--r--net/core/neighbour.c486
-rw-r--r--net/core/net-sysfs.c39
-rw-r--r--net/core/net-sysfs.h4
-rw-r--r--net/core/net-traces.c1
-rw-r--r--net/core/netpoll.c6
-rw-r--r--net/core/pktgen.c12
-rw-r--r--net/core/rtnetlink.c39
-rw-r--r--net/core/skbuff.c112
-rw-r--r--net/core/sock.c17
-rw-r--r--net/core/stream.c8
-rw-r--r--net/core/utils.c15
-rw-r--r--net/dccp/ccid.h52
-rw-r--r--net/dccp/ccids/Kconfig31
-rw-r--r--net/dccp/ccids/ccid2.c289
-rw-r--r--net/dccp/ccids/ccid2.h35
-rw-r--r--net/dccp/ccids/ccid3.c256
-rw-r--r--net/dccp/ccids/ccid3.h51
-rw-r--r--net/dccp/ccids/lib/loss_interval.c2
-rw-r--r--net/dccp/ccids/lib/packet_history.c39
-rw-r--r--net/dccp/ccids/lib/packet_history.h22
-rw-r--r--net/dccp/ccids/lib/tfrc.h1
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c14
-rw-r--r--net/dccp/dccp.h46
-rw-r--r--net/dccp/feat.c10
-rw-r--r--net/dccp/feat.h1
-rw-r--r--net/dccp/input.c20
-rw-r--r--net/dccp/ipv4.c10
-rw-r--r--net/dccp/ipv6.c10
-rw-r--r--net/dccp/minisocks.c30
-rw-r--r--net/dccp/options.c31
-rw-r--r--net/dccp/output.c20
-rw-r--r--net/dccp/probe.c1
-rw-r--r--net/dccp/proto.c50
-rw-r--r--net/decnet/dn_neigh.c13
-rw-r--r--net/decnet/dn_nsp_out.c8
-rw-r--r--net/decnet/dn_route.c3
-rw-r--r--net/dns_resolver/dns_key.c92
-rw-r--r--net/dns_resolver/dns_query.c5
-rw-r--r--net/dsa/Kconfig2
-rw-r--r--net/econet/af_econet.c6
-rw-r--r--net/ethernet/eth.c8
-rw-r--r--net/ipv4/Kconfig11
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c8
-rw-r--r--net/ipv4/arp.c245
-rw-r--r--net/ipv4/datagram.c7
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/fib_frontend.c207
-rw-r--r--net/ipv4/fib_hash.c291
-rw-r--r--net/ipv4/fib_lookup.h11
-rw-r--r--net/ipv4/fib_rules.c13
-rw-r--r--net/ipv4/fib_semantics.c297
-rw-r--r--net/ipv4/fib_trie.c80
-rw-r--r--net/ipv4/gre.c151
-rw-r--r--net/ipv4/icmp.c4
-rw-r--r--net/ipv4/igmp.c34
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_hashtables.c28
-rw-r--r--net/ipv4/ip_fragment.c6
-rw-r--r--net/ipv4/ip_gre.c245
-rw-r--r--net/ipv4/ip_options.c3
-rw-r--r--net/ipv4/ip_output.c43
-rw-r--r--net/ipv4/ip_sockglue.c3
-rw-r--r--net/ipv4/ipip.c212
-rw-r--r--net/ipv4/ipmr.c428
-rw-r--r--net/ipv4/netfilter/Kconfig4
-rw-r--r--net/ipv4/netfilter/arp_tables.c69
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c89
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c31
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c145
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c28
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c53
-rw-r--r--net/ipv4/netfilter/nf_nat_ftp.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c53
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c76
-rw-r--r--net/ipv4/netfilter/nf_nat_irc.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c17
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c27
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c6
-rw-r--r--net/ipv4/protocol.c31
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c199
-rw-r--r--net/ipv4/tcp.c52
-rw-r--r--net/ipv4/tcp_cong.c5
-rw-r--r--net/ipv4/tcp_input.c60
-rw-r--r--net/ipv4/tcp_ipv4.c12
-rw-r--r--net/ipv4/tcp_minisocks.c2
-rw-r--r--net/ipv4/tcp_output.c31
-rw-r--r--net/ipv4/tcp_probe.c1
-rw-r--r--net/ipv4/tcp_timer.c68
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/tunnel4.c19
-rw-r--r--net/ipv4/udp.c48
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_state.c33
-rw-r--r--net/ipv4/xfrm4_tunnel.c4
-rw-r--r--net/ipv6/addrconf.c19
-rw-r--r--net/ipv6/addrlabel.c10
-rw-r--r--net/ipv6/af_inet6.c9
-rw-r--r--net/ipv6/datagram.c26
-rw-r--r--net/ipv6/exthdrs_core.c4
-rw-r--r--net/ipv6/fib6_rules.c3
-rw-r--r--net/ipv6/ip6_fib.c9
-rw-r--r--net/ipv6/ip6_output.c24
-rw-r--r--net/ipv6/ip6_tunnel.c157
-rw-r--r--net/ipv6/ip6mr.c1
-rw-r--r--net/ipv6/ipv6_sockglue.c23
-rw-r--r--net/ipv6/ndisc.c36
-rw-r--r--net/ipv6/netfilter/Kconfig4
-rw-r--r--net/ipv6/netfilter/Makefile5
-rw-r--r--net/ipv6/netfilter/ip6_tables.c103
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c157
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c78
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c96
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c131
-rw-r--r--net/ipv6/protocol.c32
-rw-r--r--net/ipv6/raw.c12
-rw-r--r--net/ipv6/reassembly.c73
-rw-r--r--net/ipv6/route.c84
-rw-r--r--net/ipv6/sit.c165
-rw-r--r--net/ipv6/tcp_ipv6.c14
-rw-r--r--net/ipv6/tunnel6.c17
-rw-r--r--net/ipv6/udp.c26
-rw-r--r--net/ipv6/xfrm6_policy.c10
-rw-r--r--net/ipv6/xfrm6_state.c33
-rw-r--r--net/ipv6/xfrm6_tunnel.c8
-rw-r--r--net/ipx/Kconfig1
-rw-r--r--net/irda/af_irda.c384
-rw-r--r--net/irda/discovery.c2
-rw-r--r--net/irda/ircomm/ircomm_tty.c4
-rw-r--r--net/irda/iriap.c3
-rw-r--r--net/irda/irlan/irlan_common.c2
-rw-r--r--net/irda/irlan/irlan_eth.c34
-rw-r--r--net/irda/irlan/irlan_event.c2
-rw-r--r--net/irda/irlmp.c2
-rw-r--r--net/irda/irlmp_frame.c2
-rw-r--r--net/irda/irnet/irnet.h2
-rw-r--r--net/irda/irnet/irnet_irda.c22
-rw-r--r--net/irda/irnet/irnet_ppp.c69
-rw-r--r--net/irda/irnet/irnet_ppp.h3
-rw-r--r--net/irda/parameters.c4
-rw-r--r--net/key/af_key.c4
-rw-r--r--net/l2tp/l2tp_eth.c3
-rw-r--r--net/l2tp/l2tp_ip.c4
-rw-r--r--net/l2tp/l2tp_ppp.c2
-rw-r--r--net/llc/af_llc.c3
-rw-r--r--net/llc/llc_station.c2
-rw-r--r--net/mac80211/aes_ccm.c6
-rw-r--r--net/mac80211/aes_cmac.c6
-rw-r--r--net/mac80211/agg-rx.c30
-rw-r--r--net/mac80211/agg-tx.c16
-rw-r--r--net/mac80211/cfg.c244
-rw-r--r--net/mac80211/chan.c2
-rw-r--r--net/mac80211/debugfs.c28
-rw-r--r--net/mac80211/debugfs_key.c57
-rw-r--r--net/mac80211/debugfs_netdev.c4
-rw-r--r--net/mac80211/debugfs_sta.c7
-rw-r--r--net/mac80211/driver-ops.h14
-rw-r--r--net/mac80211/driver-trace.h42
-rw-r--r--net/mac80211/ht.c47
-rw-r--r--net/mac80211/ibss.c77
-rw-r--r--net/mac80211/ieee80211_i.h133
-rw-r--r--net/mac80211/iface.c460
-rw-r--r--net/mac80211/key.c168
-rw-r--r--net/mac80211/key.h13
-rw-r--r--net/mac80211/main.c200
-rw-r--r--net/mac80211/mesh_plink.c17
-rw-r--r--net/mac80211/mlme.c173
-rw-r--r--net/mac80211/offchannel.c26
-rw-r--r--net/mac80211/pm.c2
-rw-r--r--net/mac80211/rate.c14
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c1
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c7
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c3
-rw-r--r--net/mac80211/rc80211_pid_debugfs.c3
-rw-r--r--net/mac80211/rx.c819
-rw-r--r--net/mac80211/scan.c179
-rw-r--r--net/mac80211/sta_info.c52
-rw-r--r--net/mac80211/sta_info.h24
-rw-r--r--net/mac80211/status.c18
-rw-r--r--net/mac80211/tx.c73
-rw-r--r--net/mac80211/util.c102
-rw-r--r--net/mac80211/wep.c10
-rw-r--r--net/mac80211/work.c39
-rw-r--r--net/mac80211/wpa.c34
-rw-r--r--net/netfilter/core.c8
-rw-r--r--net/netfilter/ipvs/Kconfig20
-rw-r--r--net/netfilter/ipvs/Makefile10
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c286
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c818
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c392
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c203
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c292
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c147
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c169
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c99
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c27
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c52
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c51
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c47
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c46
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c690
-rw-r--r--net/netfilter/nf_conntrack_core.c131
-rw-r--r--net/netfilter/nf_conntrack_ecache.c4
-rw-r--r--net/netfilter/nf_conntrack_expect.c68
-rw-r--r--net/netfilter/nf_conntrack_extend.c6
-rw-r--r--net/netfilter/nf_conntrack_netlink.c121
-rw-r--r--net/netfilter/nf_conntrack_proto.c4
-rw-r--r--net/netfilter/nf_conntrack_sip.c44
-rw-r--r--net/netfilter/nf_conntrack_standalone.c28
-rw-r--r--net/netfilter/nf_log.c2
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/nf_tproxy_core.c41
-rw-r--r--net/netfilter/x_tables.c12
-rw-r--r--net/netfilter/xt_CT.c1
-rw-r--r--net/netfilter/xt_SECMARK.c35
-rw-r--r--net/netfilter/xt_TPROXY.c366
-rw-r--r--net/netfilter/xt_hashlimit.c15
-rw-r--r--net/netfilter/xt_ipvs.c1
-rw-r--r--net/netfilter/xt_recent.c1
-rw-r--r--net/netfilter/xt_socket.c167
-rw-r--r--net/netlink/af_netlink.c78
-rw-r--r--net/netlink/genetlink.c14
-rw-r--r--net/nonet.c1
-rw-r--r--net/packet/af_packet.c4
-rw-r--r--net/phonet/Kconfig12
-rw-r--r--net/phonet/af_phonet.c17
-rw-r--r--net/phonet/datagram.c13
-rw-r--r--net/phonet/pep.c391
-rw-r--r--net/phonet/pn_dev.c5
-rw-r--r--net/phonet/socket.c289
-rw-r--r--net/rds/af_rds.c26
-rw-r--r--net/rds/bind.c82
-rw-r--r--net/rds/cong.c8
-rw-r--r--net/rds/connection.c159
-rw-r--r--net/rds/ib.c200
-rw-r--r--net/rds/ib.h104
-rw-r--r--net/rds/ib_cm.c184
-rw-r--r--net/rds/ib_rdma.c318
-rw-r--r--net/rds/ib_recv.c549
-rw-r--r--net/rds/ib_send.c682
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/ib_sysctl.c19
-rw-r--r--net/rds/info.c12
-rw-r--r--net/rds/iw.c8
-rw-r--r--net/rds/iw.h15
-rw-r--r--net/rds/iw_cm.c14
-rw-r--r--net/rds/iw_rdma.c8
-rw-r--r--net/rds/iw_recv.c24
-rw-r--r--net/rds/iw_send.c93
-rw-r--r--net/rds/iw_sysctl.c6
-rw-r--r--net/rds/loop.c31
-rw-r--r--net/rds/message.c142
-rw-r--r--net/rds/page.c35
-rw-r--r--net/rds/rdma.c339
-rw-r--r--net/rds/rdma.h85
-rw-r--r--net/rds/rdma_transport.c44
-rw-r--r--net/rds/rdma_transport.h4
-rw-r--r--net/rds/rds.h192
-rw-r--r--net/rds/recv.c14
-rw-r--r--net/rds/send.c548
-rw-r--r--net/rds/stats.c6
-rw-r--r--net/rds/sysctl.c4
-rw-r--r--net/rds/tcp.c12
-rw-r--r--net/rds/tcp.h9
-rw-r--r--net/rds/tcp_connect.c6
-rw-r--r--net/rds/tcp_listen.c10
-rw-r--r--net/rds/tcp_recv.c21
-rw-r--r--net/rds/tcp_send.c72
-rw-r--r--net/rds/threads.c69
-rw-r--r--net/rds/transport.c19
-rw-r--r--net/rds/xlist.h80
-rw-r--r--net/rfkill/core.c1
-rw-r--r--net/rfkill/input.c2
-rw-r--r--net/rose/af_rose.c4
-rw-r--r--net/rose/rose_link.c4
-rw-r--r--net/rxrpc/ar-internal.h16
-rw-r--r--net/sched/Kconfig10
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_csum.c595
-rw-r--r--net/sched/act_gact.c21
-rw-r--r--net/sched/act_ipt.c14
-rw-r--r--net/sched/act_mirred.c15
-rw-r--r--net/sched/act_nat.c22
-rw-r--r--net/sched/act_police.c21
-rw-r--r--net/sched/act_simple.c11
-rw-r--r--net/sched/act_skbedit.c11
-rw-r--r--net/sched/cls_cgroup.c2
-rw-r--r--net/sched/cls_flow.c74
-rw-r--r--net/sched/cls_u32.c2
-rw-r--r--net/sched/em_meta.c6
-rw-r--r--net/sched/sch_api.c66
-rw-r--r--net/sched/sch_atm.c13
-rw-r--r--net/sched/sch_cbq.c12
-rw-r--r--net/sched/sch_drr.c4
-rw-r--r--net/sched/sch_dsmark.c6
-rw-r--r--net/sched/sch_fifo.c3
-rw-r--r--net/sched/sch_generic.c24
-rw-r--r--net/sched/sch_hfsc.c10
-rw-r--r--net/sched/sch_htb.c12
-rw-r--r--net/sched/sch_mq.c2
-rw-r--r--net/sched/sch_multiq.c3
-rw-r--r--net/sched/sch_netem.c3
-rw-r--r--net/sched/sch_prio.c2
-rw-r--r--net/sched/sch_sfq.c47
-rw-r--r--net/sched/sch_tbf.c4
-rw-r--r--net/sched/sch_teql.c10
-rw-r--r--net/sctp/associola.c2
-rw-r--r--net/sctp/auth.c8
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/inqueue.c2
-rw-r--r--net/sctp/ipv6.c4
-rw-r--r--net/sctp/objcnt.c5
-rw-r--r--net/sctp/output.c3
-rw-r--r--net/sctp/outqueue.c34
-rw-r--r--net/sctp/probe.c5
-rw-r--r--net/sctp/protocol.c19
-rw-r--r--net/sctp/sm_make_chunk.c2
-rw-r--r--net/sctp/sm_sideeffect.c21
-rw-r--r--net/sctp/sm_statefuns.c66
-rw-r--r--net/sctp/sm_statetable.c42
-rw-r--r--net/sctp/socket.c98
-rw-r--r--net/sctp/transport.c9
-rw-r--r--net/socket.c38
-rw-r--r--net/sunrpc/Kconfig9
-rw-r--r--net/sunrpc/auth.c11
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c11
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c44
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c10
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c2
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c2
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_mech.c5
-rw-r--r--net/sunrpc/cache.c17
-rw-r--r--net/sunrpc/clnt.c116
-rw-r--r--net/sunrpc/rpc_pipe.c46
-rw-r--r--net/sunrpc/sched.c2
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c2
-rw-r--r--net/sunrpc/xprtrdma/verbs.c22
-rw-r--r--net/sunrpc/xprtsock.c82
-rw-r--r--net/tipc/addr.c7
-rw-r--r--net/tipc/bcast.c51
-rw-r--r--net/tipc/bcast.h3
-rw-r--r--net/tipc/bearer.c42
-rw-r--r--net/tipc/cluster.c21
-rw-r--r--net/tipc/cluster.h2
-rw-r--r--net/tipc/config.c148
-rw-r--r--net/tipc/config.h6
-rw-r--r--net/tipc/core.c38
-rw-r--r--net/tipc/core.h9
-rw-r--r--net/tipc/dbg.c17
-rw-r--r--net/tipc/dbg.h3
-rw-r--r--net/tipc/discover.c44
-rw-r--r--net/tipc/discover.h5
-rw-r--r--net/tipc/eth_media.c48
-rw-r--r--net/tipc/link.c188
-rw-r--r--net/tipc/link.h24
-rw-r--r--net/tipc/msg.c2
-rw-r--r--net/tipc/msg.h6
-rw-r--r--net/tipc/name_distr.c2
-rw-r--r--net/tipc/name_table.c67
-rw-r--r--net/tipc/net.c10
-rw-r--r--net/tipc/node.c73
-rw-r--r--net/tipc/node.h3
-rw-r--r--net/tipc/port.c295
-rw-r--r--net/tipc/port.h4
-rw-r--r--net/tipc/ref.c17
-rw-r--r--net/tipc/ref.h1
-rw-r--r--net/tipc/socket.c83
-rw-r--r--net/tipc/subscr.c77
-rw-r--r--net/tipc/subscr.h2
-rw-r--r--net/tipc/zone.c11
-rw-r--r--net/tipc/zone.h1
-rw-r--r--net/unix/af_unix.c25
-rw-r--r--net/wireless/core.c87
-rw-r--r--net/wireless/core.h34
-rw-r--r--net/wireless/debugfs.c2
-rw-r--r--net/wireless/ibss.c21
-rw-r--r--net/wireless/mlme.c227
-rw-r--r--net/wireless/nl80211.c2189
-rw-r--r--net/wireless/nl80211.h14
-rw-r--r--net/wireless/radiotap.c61
-rw-r--r--net/wireless/reg.c22
-rw-r--r--net/wireless/scan.c12
-rw-r--r--net/wireless/sme.c11
-rw-r--r--net/wireless/sysfs.c18
-rw-r--r--net/wireless/util.c40
-rw-r--r--net/wireless/wext-compat.c45
-rw-r--r--net/wireless/wext-core.c18
-rw-r--r--net/wireless/wext-priv.c2
-rw-r--r--net/wireless/wext-sme.c2
-rw-r--r--net/x25/Kconfig1
-rw-r--r--net/x25/af_x25.c34
-rw-r--r--net/xfrm/xfrm_output.c2
-rw-r--r--net/xfrm/xfrm_policy.c12
-rw-r--r--net/xfrm/xfrm_state.c45
-rw-r--r--net/xfrm/xfrm_user.c4
495 files changed, 27573 insertions, 11637 deletions
diff --git a/net/802/fc.c b/net/802/fc.c
index 34cf1ee014b..1e49f2d4ea9 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -70,7 +70,7 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
if(daddr)
{
memcpy(fch->daddr,daddr,dev->addr_len);
- return(hdr_len);
+ return hdr_len;
}
return -hdr_len;
}
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 3ef0ab0a543..94b3ad08f39 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -82,10 +82,10 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev,
if (daddr != NULL)
{
memcpy(fddi->daddr, daddr, dev->addr_len);
- return(hl);
+ return hl;
}
- return(-hl);
+ return -hl;
}
@@ -108,7 +108,7 @@ static int fddi_rebuild_header(struct sk_buff *skb)
{
printk("%s: Don't know how to resolve type %04X addresses.\n",
skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
- return(0);
+ return 0;
}
}
@@ -162,7 +162,7 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
/* Assume 802.2 SNAP frames, for now */
- return(type);
+ return type;
}
EXPORT_SYMBOL(fddi_type_trans);
@@ -170,9 +170,9 @@ EXPORT_SYMBOL(fddi_type_trans);
int fddi_change_mtu(struct net_device *dev, int new_mtu)
{
if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
- return(-EINVAL);
+ return -EINVAL;
dev->mtu = new_mtu;
- return(0);
+ return 0;
}
EXPORT_SYMBOL(fddi_change_mtu);
diff --git a/net/802/hippi.c b/net/802/hippi.c
index cd3e8e92952..91aca8780fd 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -152,7 +152,7 @@ int hippi_change_mtu(struct net_device *dev, int new_mtu)
if ((new_mtu < 68) || (new_mtu > 65280))
return -EINVAL;
dev->mtu = new_mtu;
- return(0);
+ return 0;
}
EXPORT_SYMBOL(hippi_change_mtu);
diff --git a/net/802/tr.c b/net/802/tr.c
index 1c6e596074d..5e20cf8a074 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -145,7 +145,7 @@ static int tr_header(struct sk_buff *skb, struct net_device *dev,
{
memcpy(trh->daddr,daddr,dev->addr_len);
tr_source_route(skb, trh, dev);
- return(hdr_len);
+ return hdr_len;
}
return -hdr_len;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a2ad1525057..05b867e4375 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -44,9 +44,6 @@
int vlan_net_id __read_mostly;
-/* Our listing of VLAN group(s) */
-static struct hlist_head vlan_group_hash[VLAN_GRP_HASH_SIZE];
-
const char vlan_fullname[] = "802.1Q VLAN Support";
const char vlan_version[] = DRV_VERSION;
static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>";
@@ -59,40 +56,6 @@ static struct packet_type vlan_packet_type __read_mostly = {
/* End of global variables definitions. */
-static inline unsigned int vlan_grp_hashfn(unsigned int idx)
-{
- return ((idx >> VLAN_GRP_HASH_SHIFT) ^ idx) & VLAN_GRP_HASH_MASK;
-}
-
-/* Must be invoked with RCU read lock (no preempt) */
-static struct vlan_group *__vlan_find_group(struct net_device *real_dev)
-{
- struct vlan_group *grp;
- struct hlist_node *n;
- int hash = vlan_grp_hashfn(real_dev->ifindex);
-
- hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) {
- if (grp->real_dev == real_dev)
- return grp;
- }
-
- return NULL;
-}
-
-/* Find the protocol handler. Assumes VID < VLAN_VID_MASK.
- *
- * Must be invoked with RCU read lock (no preempt)
- */
-struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id)
-{
- struct vlan_group *grp = __vlan_find_group(real_dev);
-
- if (grp)
- return vlan_group_get_device(grp, vlan_id);
-
- return NULL;
-}
-
static void vlan_group_free(struct vlan_group *grp)
{
int i;
@@ -111,8 +74,6 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev)
return NULL;
grp->real_dev = real_dev;
- hlist_add_head_rcu(&grp->hlist,
- &vlan_group_hash[vlan_grp_hashfn(real_dev->ifindex)]);
return grp;
}
@@ -151,7 +112,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
ASSERT_RTNL();
- grp = __vlan_find_group(real_dev);
+ grp = real_dev->vlgrp;
BUG_ON(!grp);
/* Take it out of our own structures, but be sure to interlock with
@@ -173,11 +134,10 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
if (grp->nr_vlans == 0) {
vlan_gvrp_uninit_applicant(real_dev);
- if (real_dev->features & NETIF_F_HW_VLAN_RX)
+ rcu_assign_pointer(real_dev->vlgrp, NULL);
+ if (ops->ndo_vlan_rx_register)
ops->ndo_vlan_rx_register(real_dev, NULL);
- hlist_del_rcu(&grp->hlist);
-
/* Free the group, after all cpu's are done. */
call_rcu(&grp->rcu, vlan_rcu_free);
}
@@ -196,18 +156,13 @@ int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
return -EOPNOTSUPP;
}
- if ((real_dev->features & NETIF_F_HW_VLAN_RX) && !ops->ndo_vlan_rx_register) {
- pr_info("8021q: device %s has buggy VLAN hw accel\n", name);
- return -EOPNOTSUPP;
- }
-
if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) &&
(!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) {
pr_info("8021q: Device %s has buggy VLAN hw accel\n", name);
return -EOPNOTSUPP;
}
- if (__find_vlan_dev(real_dev, vlan_id) != NULL)
+ if (vlan_find_dev(real_dev, vlan_id) != NULL)
return -EEXIST;
return 0;
@@ -222,7 +177,7 @@ int register_vlan_dev(struct net_device *dev)
struct vlan_group *grp, *ngrp = NULL;
int err;
- grp = __vlan_find_group(real_dev);
+ grp = real_dev->vlgrp;
if (!grp) {
ngrp = grp = vlan_group_alloc(real_dev);
if (!grp)
@@ -252,8 +207,11 @@ int register_vlan_dev(struct net_device *dev)
vlan_group_set_device(grp, vlan_id, dev);
grp->nr_vlans++;
- if (ngrp && real_dev->features & NETIF_F_HW_VLAN_RX)
- ops->ndo_vlan_rx_register(real_dev, ngrp);
+ if (ngrp) {
+ if (ops->ndo_vlan_rx_register)
+ ops->ndo_vlan_rx_register(real_dev, ngrp);
+ rcu_assign_pointer(real_dev->vlgrp, ngrp);
+ }
if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
ops->ndo_vlan_rx_add_vid(real_dev, vlan_id);
@@ -264,7 +222,6 @@ out_uninit_applicant:
vlan_gvrp_uninit_applicant(real_dev);
out_free_group:
if (ngrp) {
- hlist_del_rcu(&ngrp->hlist);
/* Free the group, after all cpu's are done. */
call_rcu(&ngrp->rcu, vlan_rcu_free);
}
@@ -321,7 +278,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
if (new_dev == NULL)
return -ENOBUFS;
- new_dev->real_num_tx_queues = real_dev->real_num_tx_queues;
+ netif_copy_real_num_queues(new_dev, real_dev);
dev_net_set(new_dev, net);
/* need 4 bytes for extra VLAN header info,
* hope the underlying device can handle it.
@@ -428,7 +385,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0);
}
- grp = __vlan_find_group(dev);
+ grp = dev->vlgrp;
if (!grp)
goto out;
@@ -439,7 +396,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
switch (event) {
case NETDEV_CHANGE:
/* Propagate real device state to vlan devices */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ for (i = 0; i < VLAN_N_VID; i++) {
vlandev = vlan_group_get_device(grp, i);
if (!vlandev)
continue;
@@ -450,7 +407,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_CHANGEADDR:
/* Adjust unicast filters on underlying device */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ for (i = 0; i < VLAN_N_VID; i++) {
vlandev = vlan_group_get_device(grp, i);
if (!vlandev)
continue;
@@ -464,7 +421,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
break;
case NETDEV_CHANGEMTU:
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ for (i = 0; i < VLAN_N_VID; i++) {
vlandev = vlan_group_get_device(grp, i);
if (!vlandev)
continue;
@@ -478,7 +435,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_FEAT_CHANGE:
/* Propagate device features to underlying device */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ for (i = 0; i < VLAN_N_VID; i++) {
vlandev = vlan_group_get_device(grp, i);
if (!vlandev)
continue;
@@ -490,7 +447,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_DOWN:
/* Put all VLANs for this dev in the down state too. */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ for (i = 0; i < VLAN_N_VID; i++) {
vlandev = vlan_group_get_device(grp, i);
if (!vlandev)
continue;
@@ -508,7 +465,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_UP:
/* Put all VLANs for this dev in the up state too. */
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ for (i = 0; i < VLAN_N_VID; i++) {
vlandev = vlan_group_get_device(grp, i);
if (!vlandev)
continue;
@@ -525,10 +482,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
break;
case NETDEV_UNREGISTER:
+ /* twiddle thumbs on netns device moves */
+ if (dev->reg_state != NETREG_UNREGISTERING)
+ break;
+
/* Delete all VLANs for this dev. */
grp->killall = 1;
- for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ for (i = 0; i < VLAN_N_VID; i++) {
vlandev = vlan_group_get_device(grp, i);
if (!vlandev)
continue;
@@ -536,7 +497,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
/* unregistration of last vlan destroys group, abort
* afterwards */
if (grp->nr_vlans == 1)
- i = VLAN_GROUP_ARRAY_LEN;
+ i = VLAN_N_VID;
unregister_vlan_dev(vlandev, &list);
}
@@ -742,8 +703,6 @@ err0:
static void __exit vlan_cleanup_module(void)
{
- unsigned int i;
-
vlan_ioctl_set(NULL);
vlan_netlink_fini();
@@ -751,10 +710,6 @@ static void __exit vlan_cleanup_module(void)
dev_remove_pack(&vlan_packet_type);
- /* This table must be empty if there are no module references left. */
- for (i = 0; i < VLAN_GRP_HASH_SIZE; i++)
- BUG_ON(!hlist_empty(&vlan_group_hash[i]));
-
unregister_pernet_subsys(&vlan_net_ops);
rcu_barrier(); /* Wait for completion of call_rcu()'s */
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 8d9503ad01d..db01b3181fd 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -72,23 +72,6 @@ static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
return netdev_priv(dev);
}
-#define VLAN_GRP_HASH_SHIFT 5
-#define VLAN_GRP_HASH_SIZE (1 << VLAN_GRP_HASH_SHIFT)
-#define VLAN_GRP_HASH_MASK (VLAN_GRP_HASH_SIZE - 1)
-
-/* Find a VLAN device by the MAC address of its Ethernet device, and
- * it's VLAN ID. The default configuration is to have VLAN's scope
- * to be box-wide, so the MAC will be ignored. The mac will only be
- * looked at if we are configured to have a separate set of VLANs per
- * each MAC addressable interface. Note that this latter option does
- * NOT follow the spec for VLANs, but may be useful for doing very
- * large quantities of VLAN MUX/DEMUX onto FrameRelay or ATM PVCs.
- *
- * Must be invoked with rcu_read_lock (ie preempt disabled)
- * or with RTNL.
- */
-struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id);
-
/* found in vlan_dev.c */
int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *ptype, struct net_device *orig_dev);
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 01ddb0472f8..69b2f79800a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -4,50 +4,29 @@
#include <linux/netpoll.h>
#include "vlan.h"
-/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */
-int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
- u16 vlan_tci, int polling)
+bool vlan_hwaccel_do_receive(struct sk_buff **skbp)
{
+ struct sk_buff *skb = *skbp;
+ u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
struct net_device *vlan_dev;
- u16 vlan_id;
-
- if (netpoll_rx(skb))
- return NET_RX_DROP;
-
- if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
- skb->deliver_no_wcard = 1;
-
- skb->skb_iif = skb->dev->ifindex;
- __vlan_hwaccel_put_tag(skb, vlan_tci);
- vlan_id = vlan_tci & VLAN_VID_MASK;
- vlan_dev = vlan_group_get_device(grp, vlan_id);
-
- if (vlan_dev)
- skb->dev = vlan_dev;
- else if (vlan_id)
- goto drop;
-
- return (polling ? netif_receive_skb(skb) : netif_rx(skb));
+ struct vlan_rx_stats *rx_stats;
-drop:
- dev_kfree_skb_any(skb);
- return NET_RX_DROP;
-}
-EXPORT_SYMBOL(__vlan_hwaccel_rx);
-
-int vlan_hwaccel_do_receive(struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- struct vlan_rx_stats *rx_stats;
+ vlan_dev = vlan_find_dev(skb->dev, vlan_id);
+ if (!vlan_dev) {
+ if (vlan_id)
+ skb->pkt_type = PACKET_OTHERHOST;
+ return false;
+ }
- skb->dev = vlan_dev_info(dev)->real_dev;
- netif_nit_deliver(skb);
+ skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return false;
- skb->dev = dev;
- skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci);
+ skb->dev = vlan_dev;
+ skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
skb->vlan_tci = 0;
- rx_stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats);
+ rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_rx_stats);
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->rx_packets++;
@@ -64,12 +43,13 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
* This allows the VLAN to have a different MAC than the
* underlying device, and still route correctly. */
if (!compare_ether_addr(eth_hdr(skb)->h_dest,
- dev->dev_addr))
+ vlan_dev->dev_addr))
skb->pkt_type = PACKET_HOST;
break;
}
u64_stats_update_end(&rx_stats->syncp);
- return 0;
+
+ return true;
}
struct net_device *vlan_dev_real_dev(const struct net_device *dev)
@@ -84,68 +64,27 @@ u16 vlan_dev_vlan_id(const struct net_device *dev)
}
EXPORT_SYMBOL(vlan_dev_vlan_id);
-static gro_result_t
-vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
- unsigned int vlan_tci, struct sk_buff *skb)
+/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */
+int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
+ u16 vlan_tci, int polling)
{
- struct sk_buff *p;
- struct net_device *vlan_dev;
- u16 vlan_id;
-
- if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
- skb->deliver_no_wcard = 1;
-
- skb->skb_iif = skb->dev->ifindex;
__vlan_hwaccel_put_tag(skb, vlan_tci);
- vlan_id = vlan_tci & VLAN_VID_MASK;
- vlan_dev = vlan_group_get_device(grp, vlan_id);
-
- if (vlan_dev)
- skb->dev = vlan_dev;
- else if (vlan_id)
- goto drop;
-
- for (p = napi->gro_list; p; p = p->next) {
- NAPI_GRO_CB(p)->same_flow =
- p->dev == skb->dev && !compare_ether_header(
- skb_mac_header(p), skb_gro_mac_header(skb));
- NAPI_GRO_CB(p)->flush = 0;
- }
-
- return dev_gro_receive(napi, skb);
-
-drop:
- return GRO_DROP;
+ return polling ? netif_receive_skb(skb) : netif_rx(skb);
}
+EXPORT_SYMBOL(__vlan_hwaccel_rx);
gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci, struct sk_buff *skb)
{
- if (netpoll_rx_on(skb))
- return vlan_hwaccel_receive_skb(skb, grp, vlan_tci)
- ? GRO_DROP : GRO_NORMAL;
-
- skb_gro_reset_offset(skb);
-
- return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
+ __vlan_hwaccel_put_tag(skb, vlan_tci);
+ return napi_gro_receive(napi, skb);
}
EXPORT_SYMBOL(vlan_gro_receive);
gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci)
{
- struct sk_buff *skb = napi_frags_skb(napi);
-
- if (!skb)
- return GRO_DROP;
-
- if (netpoll_rx_on(skb)) {
- skb->protocol = eth_type_trans(skb, skb->dev);
- return vlan_hwaccel_receive_skb(skb, grp, vlan_tci)
- ? GRO_DROP : GRO_NORMAL;
- }
-
- return napi_frags_finish(napi, skb,
- vlan_gro_common(napi, grp, vlan_tci, skb));
+ __vlan_hwaccel_put_tag(napi->skb, vlan_tci);
+ return napi_gro_frags(napi);
}
EXPORT_SYMBOL(vlan_gro_frags);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 3d59c9bf8fe..14e3d1fa07a 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -158,7 +158,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
vlan_id = vlan_tci & VLAN_VID_MASK;
rcu_read_lock();
- vlan_dev = __find_vlan_dev(dev, vlan_id);
+ vlan_dev = vlan_find_dev(dev, vlan_id);
/* If the VLAN device is defined, we use it.
* If not, and the VID is 0, it is a 802.1p packet (not
@@ -177,8 +177,8 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
} else {
skb->dev = vlan_dev;
- rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats,
- smp_processor_id());
+ rx_stats = this_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats);
+
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->rx_packets++;
rx_stats->rx_bytes += skb->len;
@@ -226,12 +226,14 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
}
netif_rx(skb);
+
rcu_read_unlock();
return NET_RX_SUCCESS;
err_unlock:
rcu_read_unlock();
err_free:
+ atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -510,7 +512,8 @@ static int vlan_dev_open(struct net_device *dev)
if (vlan->flags & VLAN_FLAG_GVRP)
vlan_gvrp_request_join(dev);
- netif_carrier_on(dev);
+ if (netif_carrier_ok(real_dev))
+ netif_carrier_on(dev);
return 0;
clear_allmulti:
@@ -842,7 +845,7 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
accum.rx_packets += rxpackets;
accum.rx_bytes += rxbytes;
accum.rx_multicast += rxmulticast;
- /* rx_errors is an ulong, not protected by syncp */
+ /* rx_errors is ulong, not protected by syncp */
accum.rx_errors += p->rx_errors;
}
stats->rx_packets = accum.rx_packets;
diff --git a/net/9p/client.c b/net/9p/client.c
index dc6f2f26d02..83bf0541d66 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -61,13 +61,13 @@ static const match_table_t tokens = {
inline int p9_is_proto_dotl(struct p9_client *clnt)
{
- return (clnt->proto_version == p9_proto_2000L);
+ return clnt->proto_version == p9_proto_2000L;
}
EXPORT_SYMBOL(p9_is_proto_dotl);
inline int p9_is_proto_dotu(struct p9_client *clnt)
{
- return (clnt->proto_version == p9_proto_2000u);
+ return clnt->proto_version == p9_proto_2000u;
}
EXPORT_SYMBOL(p9_is_proto_dotu);
@@ -331,8 +331,10 @@ static void p9_tag_cleanup(struct p9_client *c)
}
}
- if (c->tagpool)
+ if (c->tagpool) {
+ p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
p9_idpool_destroy(c->tagpool);
+ }
/* free requests associated with tags */
for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
@@ -669,7 +671,7 @@ static void p9_fid_destroy(struct p9_fid *fid)
kfree(fid);
}
-int p9_client_version(struct p9_client *c)
+static int p9_client_version(struct p9_client *c)
{
int err = 0;
struct p9_req_t *req;
@@ -728,7 +730,6 @@ error:
return err;
}
-EXPORT_SYMBOL(p9_client_version);
struct p9_client *p9_client_create(const char *dev_name, char *options)
{
@@ -885,54 +886,6 @@ error:
}
EXPORT_SYMBOL(p9_client_attach);
-struct p9_fid *
-p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname)
-{
- int err;
- struct p9_req_t *req;
- struct p9_qid qid;
- struct p9_fid *afid;
-
- P9_DPRINTK(P9_DEBUG_9P, ">>> TAUTH uname %s aname %s\n", uname, aname);
- err = 0;
-
- afid = p9_fid_create(clnt);
- if (IS_ERR(afid)) {
- err = PTR_ERR(afid);
- afid = NULL;
- goto error;
- }
-
- req = p9_client_rpc(clnt, P9_TAUTH, "dss?d",
- afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
- if (IS_ERR(req)) {
- err = PTR_ERR(req);
- goto error;
- }
-
- err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
- if (err) {
- p9pdu_dump(1, req->rc);
- p9_free_req(clnt, req);
- goto error;
- }
-
- P9_DPRINTK(P9_DEBUG_9P, "<<< RAUTH qid %x.%llx.%x\n",
- qid.type,
- (unsigned long long)qid.path,
- qid.version);
-
- memmove(&afid->qid, &qid, sizeof(struct p9_qid));
- p9_free_req(clnt, req);
- return afid;
-
-error:
- if (afid)
- p9_fid_destroy(afid);
- return ERR_PTR(err);
-}
-EXPORT_SYMBOL(p9_client_auth);
-
struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
int clone)
{
@@ -944,6 +897,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
int16_t nwqids, count;
err = 0;
+ wqids = NULL;
clnt = oldfid->clnt;
if (clone) {
fid = p9_fid_create(clnt);
@@ -994,9 +948,11 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
else
fid->qid = oldfid->qid;
+ kfree(wqids);
return fid;
clunk_fid:
+ kfree(wqids);
p9_client_clunk(fid);
fid = NULL;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index c85109d809c..078eb162d9b 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -222,7 +222,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
}
}
-static unsigned int
+static int
p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt)
{
int ret, n;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 0ea20c30466..17c5ba7551a 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -426,8 +426,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
/* Allocate an fcall for the reply */
rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL);
- if (!rpl_context)
+ if (!rpl_context) {
+ err = -ENOMEM;
goto err_close;
+ }
/*
* If the request has a buffer, steal it, otherwise
@@ -445,8 +447,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
}
rpl_context->rc = req->rc;
if (!rpl_context->rc) {
- kfree(rpl_context);
- goto err_close;
+ err = -ENOMEM;
+ goto err_free2;
}
/*
@@ -458,11 +460,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
*/
if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
err = post_recv(client, rpl_context);
- if (err) {
- kfree(rpl_context->rc);
- kfree(rpl_context);
- goto err_close;
- }
+ if (err)
+ goto err_free1;
} else
atomic_dec(&rdma->rq_count);
@@ -471,8 +470,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
/* Post the request */
c = kmalloc(sizeof *c, GFP_KERNEL);
- if (!c)
- goto err_close;
+ if (!c) {
+ err = -ENOMEM;
+ goto err_free1;
+ }
c->req = req;
c->busa = ib_dma_map_single(rdma->cm_id->device,
@@ -499,9 +500,15 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
return ib_post_send(rdma->qp, &wr, &bad_wr);
error:
+ kfree(c);
+ kfree(rpl_context->rc);
+ kfree(rpl_context);
P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
return -EIO;
-
+ err_free1:
+ kfree(rpl_context->rc);
+ err_free2:
+ kfree(rpl_context);
err_close:
spin_lock_irqsave(&rdma->req_lock, flags);
if (rdma->state < P9_RDMA_CLOSING) {
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index dcfbe99ff81..b88515936e4 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -329,7 +329,8 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
mutex_lock(&virtio_9p_lock);
list_for_each_entry(chan, &virtio_chan_list, chan_list) {
- if (!strncmp(devname, chan->tag, chan->tag_len)) {
+ if (!strncmp(devname, chan->tag, chan->tag_len) &&
+ strlen(devname) == chan->tag_len) {
if (!chan->inuse) {
chan->inuse = true;
found = 1;
diff --git a/net/Kconfig b/net/Kconfig
index e330594d370..55fd82e9ffd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -217,7 +217,7 @@ source "net/dns_resolver/Kconfig"
config RPS
boolean
- depends on SMP && SYSFS
+ depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
default y
menu "Network testing"
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
source "net/rfkill/Kconfig"
source "net/9p/Kconfig"
source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
endif # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1..6b7bfd7f141 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o
endif
obj-$(CONFIG_WIMAX) += wimax/
obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
+obj-$(CONFIG_CEPH_LIB) += ceph/
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 651babdfab3..ad2b232a205 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -399,12 +399,6 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
unregister_netdev(net_dev);
free_netdev(net_dev);
}
- read_lock_irq(&devs_lock);
- if (list_empty(&br2684_devs)) {
- /* last br2684 device */
- unregister_atmdevice_notifier(&atm_dev_notifier);
- }
- read_unlock_irq(&devs_lock);
return;
}
@@ -675,7 +669,6 @@ static int br2684_create(void __user *arg)
if (list_empty(&br2684_devs)) {
/* 1st br2684 device */
- register_atmdevice_notifier(&atm_dev_notifier);
brdev->number = 1;
} else
brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
@@ -815,6 +808,7 @@ static int __init br2684_init(void)
return -ENOMEM;
#endif
register_atm_ioctl(&br2684_ioctl_ops);
+ register_atmdevice_notifier(&atm_dev_notifier);
return 0;
}
@@ -830,9 +824,7 @@ static void __exit br2684_exit(void)
#endif
- /* if not already empty */
- if (!list_empty(&br2684_devs))
- unregister_atmdevice_notifier(&atm_dev_notifier);
+ unregister_atmdevice_notifier(&atm_dev_notifier);
while (!list_empty(&br2684_devs)) {
net_dev = list_entry_brdev(br2684_devs.next);
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 95fdd118506..ff956d1115b 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -310,9 +310,9 @@ static int clip_constructor(struct neighbour *neigh)
return 0;
}
-static u32 clip_hash(const void *pkey, const struct net_device *dev)
+static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
{
- return jhash_2words(*(u32 *) pkey, dev->ifindex, clip_tbl.hash_rnd);
+ return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd);
}
static struct neigh_table clip_tbl = {
diff --git a/net/atm/common.c b/net/atm/common.c
index 940404a73b3..1b9c52a02cd 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -792,7 +792,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
default:
if (level == SOL_SOCKET)
return -EINVAL;
- break;
+ break;
}
if (!vcc->dev || !vcc->dev->ops->getsockopt)
return -EINVAL;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index d98bde1a0ac..181d70c73d7 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -220,7 +220,6 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc)
static int lec_open(struct net_device *dev)
{
netif_start_queue(dev);
- memset(&dev->stats, 0, sizeof(struct net_device_stats));
return 0;
}
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 622b471e14e..74bcc662c3d 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -778,7 +778,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
eg->packets_rcvd++;
mpc->eg_ops->put(eg);
- memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+ memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
netif_rx(new_skb);
}
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 6262aeae398..f85da0779e5 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -38,6 +38,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
static const struct file_operations proc_atm_dev_ops = {
.owner = THIS_MODULE,
.read = proc_dev_atm_read,
+ .llseek = noop_llseek,
};
static void add_stats(struct seq_file *seq, const char *aal,
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index cfdfd7e2a17..26eaebf4aaa 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1103,7 +1103,7 @@ done:
out:
release_sock(sk);
- return 0;
+ return err;
}
/*
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 2ce79df0068..c7d81436213 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -112,8 +112,8 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
if (sk) {
sock_hold(sk);
ax25_destroy_socket(ax25);
- sock_put(sk);
bh_unlock_sock(sk);
+ sock_put(sk);
} else
ax25_destroy_socket(ax25);
return;
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 7805945a5fd..a1690845dc6 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -412,7 +412,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
{
ax25_uid_assoc *user;
ax25_route *ax25_rt;
- int err;
+ int err = 0;
if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
return -EHOSTUNREACH;
@@ -453,7 +453,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
put:
ax25_put_route(ax25_rt);
- return 0;
+ return err;
}
struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 421c45bd1b9..c4cf3f59500 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -265,6 +265,115 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
}
EXPORT_SYMBOL(bt_sock_recvmsg);
+static long bt_sock_data_wait(struct sock *sk, long timeo)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+
+ if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
+ break;
+
+ if (signal_pending(current) || !timeo)
+ break;
+
+ set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return timeo;
+}
+
+int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
+ struct msghdr *msg, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ size_t target, copied = 0;
+ long timeo;
+
+ if (flags & MSG_OOB)
+ return -EOPNOTSUPP;
+
+ msg->msg_namelen = 0;
+
+ BT_DBG("sk %p size %zu", sk, size);
+
+ lock_sock(sk);
+
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ struct sk_buff *skb;
+ int chunk;
+
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (!skb) {
+ if (copied >= target)
+ break;
+
+ if ((err = sock_error(sk)) != 0)
+ break;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+
+ timeo = bt_sock_data_wait(sk, timeo);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeo);
+ goto out;
+ }
+ continue;
+ }
+
+ chunk = min_t(unsigned int, skb->len, size);
+ if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ copied += chunk;
+ size -= chunk;
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+
+ if (!(flags & MSG_PEEK)) {
+ skb_pull(skb, chunk);
+ if (skb->len) {
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ kfree_skb(skb);
+
+ } else {
+ /* put message back and return */
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ break;
+ }
+ } while (size);
+
+out:
+ release_sock(sk);
+ return copied ? : err;
+}
+EXPORT_SYMBOL(bt_sock_stream_recvmsg);
+
static inline unsigned int bt_accept_poll(struct sock *parent)
{
struct list_head *p, *n;
@@ -297,13 +406,12 @@ unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *w
mask |= POLLERR;
if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLRDHUP;
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sk->sk_shutdown & RCV_SHUTDOWN))
+ if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
if (sk->sk_state == BT_CLOSED)
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index d4c6af082d4..ec0a1347f93 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -321,14 +321,10 @@ static int cmtp_session(void *arg)
int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
{
struct cmtp_session *session, *s;
- bdaddr_t src, dst;
int i, err;
BT_DBG("");
- baswap(&src, &bt_sk(sock->sk)->src);
- baswap(&dst, &bt_sk(sock->sk)->dst);
-
session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL);
if (!session)
return -ENOMEM;
@@ -347,7 +343,7 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
BT_DBG("mtu %d", session->mtu);
- sprintf(session->name, "%s", batostr(&dst));
+ sprintf(session->name, "%s", batostr(&bt_sk(sock->sk)->dst));
session->sock = sock;
session->state = BT_CONFIG;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index c52f091ee6d..bc2a052e518 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -562,7 +562,6 @@ static int hci_dev_do_close(struct hci_dev *hdev)
hci_dev_lock_bh(hdev);
inquiry_cache_flush(hdev);
hci_conn_hash_flush(hdev);
- hci_blacklist_clear(hdev);
hci_dev_unlock_bh(hdev);
hci_notify(hdev, HCI_DEV_DOWN);
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 8fb967beee8..5fce3d6d07b 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -37,9 +37,7 @@ static ssize_t show_link_type(struct device *dev, struct device_attribute *attr,
static ssize_t show_link_address(struct device *dev, struct device_attribute *attr, char *buf)
{
struct hci_conn *conn = dev_get_drvdata(dev);
- bdaddr_t bdaddr;
- baswap(&bdaddr, &conn->dst);
- return sprintf(buf, "%s\n", batostr(&bdaddr));
+ return sprintf(buf, "%s\n", batostr(&conn->dst));
}
static ssize_t show_link_features(struct device *dev, struct device_attribute *attr, char *buf)
@@ -196,8 +194,8 @@ static inline char *host_typetostr(int type)
switch (type) {
case HCI_BREDR:
return "BR/EDR";
- case HCI_80211:
- return "802.11";
+ case HCI_AMP:
+ return "AMP";
default:
return "UNKNOWN";
}
@@ -238,9 +236,7 @@ static ssize_t show_class(struct device *dev, struct device_attribute *attr, cha
static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf)
{
struct hci_dev *hdev = dev_get_drvdata(dev);
- bdaddr_t bdaddr;
- baswap(&bdaddr, &hdev->bdaddr);
- return sprintf(buf, "%s\n", batostr(&bdaddr));
+ return sprintf(buf, "%s\n", batostr(&hdev->bdaddr));
}
static ssize_t show_features(struct device *dev, struct device_attribute *attr, char *buf)
@@ -408,10 +404,8 @@ static int inquiry_cache_show(struct seq_file *f, void *p)
for (e = cache->list; e; e = e->next) {
struct inquiry_data *data = &e->data;
- bdaddr_t bdaddr;
- baswap(&bdaddr, &data->bdaddr);
seq_printf(f, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n",
- batostr(&bdaddr),
+ batostr(&data->bdaddr),
data->pscan_rep_mode, data->pscan_period_mode,
data->pscan_mode, data->dev_class[2],
data->dev_class[1], data->dev_class[0],
@@ -445,13 +439,10 @@ static int blacklist_show(struct seq_file *f, void *p)
list_for_each(l, &hdev->blacklist) {
struct bdaddr_list *b;
- bdaddr_t bdaddr;
b = list_entry(l, struct bdaddr_list, list);
- baswap(&bdaddr, &b->bdaddr);
-
- seq_printf(f, "%s\n", batostr(&bdaddr));
+ seq_printf(f, "%s\n", batostr(&b->bdaddr));
}
hci_dev_unlock_bh(hdev);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index bfe641b7dfa..c0ee8b3928e 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -758,7 +758,6 @@ static int hidp_setup_hid(struct hidp_session *session,
struct hidp_connadd_req *req)
{
struct hid_device *hid;
- bdaddr_t src, dst;
int err;
session->rd_data = kzalloc(req->rd_size, GFP_KERNEL);
@@ -781,9 +780,6 @@ static int hidp_setup_hid(struct hidp_session *session,
hid->driver_data = session;
- baswap(&src, &bt_sk(session->ctrl_sock->sk)->src);
- baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst);
-
hid->bus = BUS_BLUETOOTH;
hid->vendor = req->vendor;
hid->product = req->product;
@@ -791,8 +787,8 @@ static int hidp_setup_hid(struct hidp_session *session,
hid->country = req->country;
strncpy(hid->name, req->name, 128);
- strncpy(hid->phys, batostr(&src), 64);
- strncpy(hid->uniq, batostr(&dst), 64);
+ strncpy(hid->phys, batostr(&bt_sk(session->ctrl_sock->sk)->src), 64);
+ strncpy(hid->uniq, batostr(&bt_sk(session->ctrl_sock->sk)->dst), 64);
hid->dev.parent = hidp_get_device(session);
hid->ll_driver = &hidp_hid_driver;
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 3e3cd9d4e52..daa7a988d9a 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -1008,10 +1008,20 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
goto done;
}
- if (la.l2_psm && __le16_to_cpu(la.l2_psm) < 0x1001 &&
- !capable(CAP_NET_BIND_SERVICE)) {
- err = -EACCES;
- goto done;
+ if (la.l2_psm) {
+ __u16 psm = __le16_to_cpu(la.l2_psm);
+
+ /* PSM must be odd and lsb of upper byte must be 0 */
+ if ((psm & 0x0101) != 0x0001) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ /* Restrict usage of well-known PSMs */
+ if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) {
+ err = -EACCES;
+ goto done;
+ }
}
write_lock_bh(&l2cap_sk_list.lock);
@@ -1190,6 +1200,13 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
goto done;
}
+ /* PSM must be odd and lsb of upper byte must be 0 */
+ if ((__le16_to_cpu(la.l2_psm) & 0x0101) != 0x0001 &&
+ sk->sk_type != SOCK_RAW) {
+ err = -EINVAL;
+ goto done;
+ }
+
/* Set destination address and psm */
bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr);
l2cap_pi(sk)->psm = la.l2_psm;
@@ -1441,33 +1458,23 @@ static inline void l2cap_do_send(struct sock *sk, struct sk_buff *skb)
static void l2cap_streaming_send(struct sock *sk)
{
- struct sk_buff *skb, *tx_skb;
+ struct sk_buff *skb;
struct l2cap_pinfo *pi = l2cap_pi(sk);
u16 control, fcs;
- while ((skb = sk->sk_send_head)) {
- tx_skb = skb_clone(skb, GFP_ATOMIC);
-
- control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
+ while ((skb = skb_dequeue(TX_QUEUE(sk)))) {
+ control = get_unaligned_le16(skb->data + L2CAP_HDR_SIZE);
control |= pi->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT;
- put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE);
+ put_unaligned_le16(control, skb->data + L2CAP_HDR_SIZE);
if (pi->fcs == L2CAP_FCS_CRC16) {
- fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2);
- put_unaligned_le16(fcs, tx_skb->data + tx_skb->len - 2);
+ fcs = crc16(0, (u8 *)skb->data, skb->len - 2);
+ put_unaligned_le16(fcs, skb->data + skb->len - 2);
}
- l2cap_do_send(sk, tx_skb);
+ l2cap_do_send(sk, skb);
pi->next_tx_seq = (pi->next_tx_seq + 1) % 64;
-
- if (skb_queue_is_last(TX_QUEUE(sk), skb))
- sk->sk_send_head = NULL;
- else
- sk->sk_send_head = skb_queue_next(TX_QUEUE(sk), skb);
-
- skb = skb_dequeue(TX_QUEUE(sk));
- kfree_skb(skb);
}
}
@@ -1645,7 +1652,7 @@ static inline int l2cap_skbuff_fromiovec(struct sock *sk, struct msghdr *msg, in
*frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err);
if (!*frag)
- return -EFAULT;
+ return err;
if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count))
return -EFAULT;
@@ -1671,7 +1678,7 @@ static struct sk_buff *l2cap_create_connless_pdu(struct sock *sk, struct msghdr
skb = bt_skb_send_alloc(sk, count + hlen,
msg->msg_flags & MSG_DONTWAIT, &err);
if (!skb)
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
/* Create L2CAP header */
lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1700,7 +1707,7 @@ static struct sk_buff *l2cap_create_basic_pdu(struct sock *sk, struct msghdr *ms
skb = bt_skb_send_alloc(sk, count + hlen,
msg->msg_flags & MSG_DONTWAIT, &err);
if (!skb)
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
/* Create L2CAP header */
lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1737,7 +1744,7 @@ static struct sk_buff *l2cap_create_iframe_pdu(struct sock *sk, struct msghdr *m
skb = bt_skb_send_alloc(sk, count + hlen,
msg->msg_flags & MSG_DONTWAIT, &err);
if (!skb)
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
/* Create L2CAP header */
lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1944,6 +1951,9 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct ms
release_sock(sk);
+ if (sock->type == SOCK_STREAM)
+ return bt_sock_stream_recvmsg(iocb, sock, msg, len, flags);
+
return bt_sock_recvmsg(iocb, sock, msg, len, flags);
}
@@ -1960,6 +1970,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __us
switch (optname) {
case L2CAP_OPTIONS:
+ if (sk->sk_state == BT_CONNECTED) {
+ err = -EINVAL;
+ break;
+ }
+
opts.imtu = l2cap_pi(sk)->imtu;
opts.omtu = l2cap_pi(sk)->omtu;
opts.flush_to = l2cap_pi(sk)->flush_to;
@@ -2705,8 +2720,9 @@ done:
case L2CAP_MODE_ERTM:
pi->remote_tx_win = rfc.txwin_size;
pi->remote_max_tx = rfc.max_transmit;
- if (rfc.max_pdu_size > pi->conn->mtu - 10)
- rfc.max_pdu_size = le16_to_cpu(pi->conn->mtu - 10);
+
+ if (le16_to_cpu(rfc.max_pdu_size) > pi->conn->mtu - 10)
+ rfc.max_pdu_size = cpu_to_le16(pi->conn->mtu - 10);
pi->remote_mps = le16_to_cpu(rfc.max_pdu_size);
@@ -2723,8 +2739,8 @@ done:
break;
case L2CAP_MODE_STREAMING:
- if (rfc.max_pdu_size > pi->conn->mtu - 10)
- rfc.max_pdu_size = le16_to_cpu(pi->conn->mtu - 10);
+ if (le16_to_cpu(rfc.max_pdu_size) > pi->conn->mtu - 10)
+ rfc.max_pdu_size = cpu_to_le16(pi->conn->mtu - 10);
pi->remote_mps = le16_to_cpu(rfc.max_pdu_size);
@@ -2770,10 +2786,10 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data,
case L2CAP_CONF_MTU:
if (val < L2CAP_DEFAULT_MIN_MTU) {
*result = L2CAP_CONF_UNACCEPT;
- pi->omtu = L2CAP_DEFAULT_MIN_MTU;
+ pi->imtu = L2CAP_DEFAULT_MIN_MTU;
} else
- pi->omtu = val;
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->omtu);
+ pi->imtu = val;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu);
break;
case L2CAP_CONF_FLUSH_TO:
@@ -2806,7 +2822,6 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data,
if (*result == L2CAP_CONF_SUCCESS) {
switch (rfc.mode) {
case L2CAP_MODE_ERTM:
- pi->remote_tx_win = rfc.txwin_size;
pi->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
pi->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
pi->mps = le16_to_cpu(rfc.max_pdu_size);
@@ -2862,7 +2877,6 @@ static void l2cap_conf_rfc_get(struct sock *sk, void *rsp, int len)
done:
switch (rfc.mode) {
case L2CAP_MODE_ERTM:
- pi->remote_tx_win = rfc.txwin_size;
pi->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
pi->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
pi->mps = le16_to_cpu(rfc.max_pdu_size);
@@ -2897,7 +2911,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
struct l2cap_chan_list *list = &conn->chan_list;
struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
struct l2cap_conn_rsp rsp;
- struct sock *parent, *uninitialized_var(sk);
+ struct sock *parent, *sk = NULL;
int result, status = L2CAP_CS_NO_INFO;
u16 dcid = 0, scid = __le16_to_cpu(req->scid);
@@ -3006,7 +3020,7 @@ sendresp:
L2CAP_INFO_REQ, sizeof(info), &info);
}
- if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) &&
+ if (sk && !(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) &&
result == L2CAP_CR_SUCCESS) {
u8 buf[128];
l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
@@ -3072,6 +3086,17 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
return 0;
}
+static inline void set_default_fcs(struct l2cap_pinfo *pi)
+{
+ /* FCS is enabled only in ERTM or streaming mode, if one or both
+ * sides request it.
+ */
+ if (pi->mode != L2CAP_MODE_ERTM && pi->mode != L2CAP_MODE_STREAMING)
+ pi->fcs = L2CAP_FCS_NONE;
+ else if (!(pi->conf_state & L2CAP_CONF_NO_FCS_RECV))
+ pi->fcs = L2CAP_FCS_CRC16;
+}
+
static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
{
struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
@@ -3089,14 +3114,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
if (!sk)
return -ENOENT;
- if (sk->sk_state != BT_CONFIG) {
- struct l2cap_cmd_rej rej;
-
- rej.reason = cpu_to_le16(0x0002);
- l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ,
- sizeof(rej), &rej);
+ if (sk->sk_state == BT_DISCONN)
goto unlock;
- }
/* Reject if config buffer is too small. */
len = cmd_len - sizeof(*req);
@@ -3136,9 +3155,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
goto unlock;
if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) {
- if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) ||
- l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
- l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
+ set_default_fcs(l2cap_pi(sk));
sk->sk_state = BT_CONNECTED;
@@ -3154,6 +3171,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)) {
u8 buf[64];
+ l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
l2cap_build_conf_req(sk, buf), buf);
l2cap_pi(sk)->num_conf_req++;
@@ -3226,9 +3244,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE;
if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) {
- if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) ||
- l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
- l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
+ set_default_fcs(l2cap_pi(sk));
sk->sk_state = BT_CONNECTED;
l2cap_pi(sk)->next_tx_seq = 0;
@@ -4648,6 +4664,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
if (flags & ACL_START) {
struct l2cap_hdr *hdr;
+ struct sock *sk;
+ u16 cid;
int len;
if (conn->rx_len) {
@@ -4658,7 +4676,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
l2cap_conn_unreliable(conn, ECOMM);
}
- if (skb->len < 2) {
+ /* Start fragment always begin with Basic L2CAP header */
+ if (skb->len < L2CAP_HDR_SIZE) {
BT_ERR("Frame is too short (len %d)", skb->len);
l2cap_conn_unreliable(conn, ECOMM);
goto drop;
@@ -4666,6 +4685,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
hdr = (struct l2cap_hdr *) skb->data;
len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE;
+ cid = __le16_to_cpu(hdr->cid);
if (len == skb->len) {
/* Complete frame received */
@@ -4682,6 +4702,19 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
goto drop;
}
+ sk = l2cap_get_chan_by_scid(&conn->chan_list, cid);
+
+ if (sk && l2cap_pi(sk)->imtu < len - L2CAP_HDR_SIZE) {
+ BT_ERR("Frame exceeding recv MTU (len %d, MTU %d)",
+ len, l2cap_pi(sk)->imtu);
+ bh_unlock_sock(sk);
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ if (sk)
+ bh_unlock_sock(sk);
+
/* Allocate skb for the complete frame (with header) */
conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC);
if (!conn->rx_skb)
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index ad2af5814e4..b826d1bf10d 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -51,8 +51,8 @@ char *batostr(bdaddr_t *ba)
i ^= 1;
sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X",
- ba->b[0], ba->b[1], ba->b[2],
- ba->b[3], ba->b[4], ba->b[5]);
+ ba->b[5], ba->b[4], ba->b[3],
+ ba->b[2], ba->b[1], ba->b[0]);
return str[i];
}
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 7dca91bb8c5..39a5d87e33b 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -113,11 +113,10 @@ static void rfcomm_session_del(struct rfcomm_session *s);
#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
#define __get_rpn_parity(line) (((line) >> 3) & 0x7)
-static inline void rfcomm_schedule(uint event)
+static inline void rfcomm_schedule(void)
{
if (!rfcomm_thread)
return;
- //set_bit(event, &rfcomm_event);
set_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event);
wake_up_process(rfcomm_thread);
}
@@ -179,13 +178,13 @@ static unsigned char rfcomm_crc_table[256] = {
/* FCS on 2 bytes */
static inline u8 __fcs(u8 *data)
{
- return (0xff - __crc(data));
+ return 0xff - __crc(data);
}
/* FCS on 3 bytes */
static inline u8 __fcs2(u8 *data)
{
- return (0xff - rfcomm_crc_table[__crc(data) ^ data[2]]);
+ return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]];
}
/* Check FCS */
@@ -203,13 +202,13 @@ static inline int __check_fcs(u8 *data, int type, u8 fcs)
static void rfcomm_l2state_change(struct sock *sk)
{
BT_DBG("%p state %d", sk, sk->sk_state);
- rfcomm_schedule(RFCOMM_SCHED_STATE);
+ rfcomm_schedule();
}
static void rfcomm_l2data_ready(struct sock *sk, int bytes)
{
BT_DBG("%p bytes %d", sk, bytes);
- rfcomm_schedule(RFCOMM_SCHED_RX);
+ rfcomm_schedule();
}
static int rfcomm_l2sock_create(struct socket **sock)
@@ -255,7 +254,7 @@ static void rfcomm_session_timeout(unsigned long arg)
BT_DBG("session %p state %ld", s, s->state);
set_bit(RFCOMM_TIMED_OUT, &s->flags);
- rfcomm_schedule(RFCOMM_SCHED_TIMEO);
+ rfcomm_schedule();
}
static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout)
@@ -283,7 +282,7 @@ static void rfcomm_dlc_timeout(unsigned long arg)
set_bit(RFCOMM_TIMED_OUT, &d->flags);
rfcomm_dlc_put(d);
- rfcomm_schedule(RFCOMM_SCHED_TIMEO);
+ rfcomm_schedule();
}
static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout)
@@ -465,7 +464,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
case BT_CONFIG:
if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
set_bit(RFCOMM_AUTH_REJECT, &d->flags);
- rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ rfcomm_schedule();
break;
}
/* Fall through */
@@ -485,7 +484,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
case BT_CONNECT2:
if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
set_bit(RFCOMM_AUTH_REJECT, &d->flags);
- rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ rfcomm_schedule();
break;
}
/* Fall through */
@@ -533,7 +532,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
skb_queue_tail(&d->tx_queue, skb);
if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags))
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
return len;
}
@@ -545,7 +544,7 @@ void __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
d->v24_sig |= RFCOMM_V24_FC;
set_bit(RFCOMM_MSC_PENDING, &d->flags);
}
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
}
void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
@@ -556,7 +555,7 @@ void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
d->v24_sig &= ~RFCOMM_V24_FC;
set_bit(RFCOMM_MSC_PENDING, &d->flags);
}
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
}
/*
@@ -577,7 +576,7 @@ int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig)
d->v24_sig = v24_sig;
if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags))
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
return 0;
}
@@ -816,7 +815,7 @@ static int rfcomm_queue_disc(struct rfcomm_dlc *d)
cmd->fcs = __fcs2((u8 *) cmd);
skb_queue_tail(&d->tx_queue, skb);
- rfcomm_schedule(RFCOMM_SCHED_TX);
+ rfcomm_schedule();
return 0;
}
@@ -1415,8 +1414,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
return 0;
if (len == 1) {
- /* This is a request, return default settings */
- bit_rate = RFCOMM_RPN_BR_115200;
+ /* This is a request, return default (according to ETSI TS 07.10) settings */
+ bit_rate = RFCOMM_RPN_BR_9600;
data_bits = RFCOMM_RPN_DATA_8;
stop_bits = RFCOMM_RPN_STOP_1;
parity = RFCOMM_RPN_PARITY_NONE;
@@ -1431,9 +1430,9 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) {
bit_rate = rpn->bit_rate;
- if (bit_rate != RFCOMM_RPN_BR_115200) {
+ if (bit_rate > RFCOMM_RPN_BR_230400) {
BT_DBG("RPN bit rate mismatch 0x%x", bit_rate);
- bit_rate = RFCOMM_RPN_BR_115200;
+ bit_rate = RFCOMM_RPN_BR_9600;
rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
}
}
@@ -1698,7 +1697,7 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb)
break;
default:
- BT_ERR("Unknown packet type 0x%02x\n", type);
+ BT_ERR("Unknown packet type 0x%02x", type);
break;
}
kfree_skb(skb);
@@ -1884,7 +1883,7 @@ static inline void rfcomm_accept_connection(struct rfcomm_session *s)
* L2CAP MTU minus UIH header and FCS. */
s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5;
- rfcomm_schedule(RFCOMM_SCHED_RX);
+ rfcomm_schedule();
} else
sock_release(nsock);
}
@@ -2093,7 +2092,7 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
rfcomm_session_put(s);
- rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ rfcomm_schedule();
}
static struct hci_cb rfcomm_cb = {
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 44a62327595..aec505f934d 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -82,11 +82,14 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
{
struct sock *sk = d->owner, *parent;
+ unsigned long flags;
+
if (!sk)
return;
BT_DBG("dlc %p state %ld err %d", d, d->state, err);
+ local_irq_save(flags);
bh_lock_sock(sk);
if (err)
@@ -108,6 +111,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
}
bh_unlock_sock(sk);
+ local_irq_restore(flags);
if (parent && sock_flag(sk, SOCK_ZAPPED)) {
/* We have to drop DLC lock here, otherwise
@@ -617,121 +621,29 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
return sent;
}
-static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue(sk_sleep(sk), &wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- sk->sk_err ||
- (sk->sk_shutdown & RCV_SHUTDOWN) ||
- signal_pending(current) ||
- !timeo)
- break;
-
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- release_sock(sk);
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(sk_sleep(sk), &wait);
- return timeo;
-}
-
static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
struct sock *sk = sock->sk;
struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
- int err = 0;
- size_t target, copied = 0;
- long timeo;
+ int len;
if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
rfcomm_dlc_accept(d);
return 0;
}
- if (flags & MSG_OOB)
- return -EOPNOTSUPP;
-
- msg->msg_namelen = 0;
-
- BT_DBG("sk %p size %zu", sk, size);
+ len = bt_sock_stream_recvmsg(iocb, sock, msg, size, flags);
lock_sock(sk);
+ if (!(flags & MSG_PEEK) && len > 0)
+ atomic_sub(len, &sk->sk_rmem_alloc);
- target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- do {
- struct sk_buff *skb;
- int chunk;
-
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (!skb) {
- if (copied >= target)
- break;
-
- if ((err = sock_error(sk)) != 0)
- break;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- break;
-
- err = -EAGAIN;
- if (!timeo)
- break;
-
- timeo = rfcomm_sock_data_wait(sk, timeo);
-
- if (signal_pending(current)) {
- err = sock_intr_errno(timeo);
- goto out;
- }
- continue;
- }
-
- chunk = min_t(unsigned int, skb->len, size);
- if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
- skb_queue_head(&sk->sk_receive_queue, skb);
- if (!copied)
- copied = -EFAULT;
- break;
- }
- copied += chunk;
- size -= chunk;
-
- sock_recv_ts_and_drops(msg, sk, skb);
-
- if (!(flags & MSG_PEEK)) {
- atomic_sub(chunk, &sk->sk_rmem_alloc);
-
- skb_pull(skb, chunk);
- if (skb->len) {
- skb_queue_head(&sk->sk_receive_queue, skb);
- break;
- }
- kfree_skb(skb);
-
- } else {
- /* put message back and return */
- skb_queue_head(&sk->sk_receive_queue, skb);
- break;
- }
- } while (size);
-
-out:
if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2))
rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc);
-
release_sock(sk);
- return copied ? : err;
+
+ return len;
}
static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen)
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index befc3a52aa0..a9b81f5dacd 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -183,9 +183,7 @@ static struct device *rfcomm_get_device(struct rfcomm_dev *dev)
static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf)
{
struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
- bdaddr_t bdaddr;
- baswap(&bdaddr, &dev->dst);
- return sprintf(buf, "%s\n", batostr(&bdaddr));
+ return sprintf(buf, "%s\n", batostr(&dev->dst));
}
static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf)
@@ -844,10 +842,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
BT_DBG("TIOCMIWAIT");
break;
- case TIOCGICOUNT:
- BT_DBG("TIOCGICOUNT");
- break;
-
case TIOCGSERIAL:
BT_ERR("TIOCGSERIAL is not supported");
return -ENOIOCTLCMD;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index cf09fe591fc..17cb0b63357 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -212,6 +212,11 @@ static int br_set_tx_csum(struct net_device *dev, u32 data)
return 0;
}
+static int br_set_flags(struct net_device *netdev, u32 data)
+{
+ return ethtool_op_set_flags(netdev, data, ETH_FLAG_TXVLAN);
+}
+
#ifdef CONFIG_NET_POLL_CONTROLLER
static void br_poll_controller(struct net_device *br_dev)
{
@@ -304,6 +309,7 @@ static const struct ethtool_ops br_ethtool_ops = {
.get_ufo = ethtool_op_get_ufo,
.set_ufo = ethtool_op_set_ufo,
.get_flags = ethtool_op_get_flags,
+ .set_flags = br_set_flags,
};
static const struct net_device_ops br_netdev_ops = {
@@ -343,5 +349,5 @@ void br_dev_setup(struct net_device *dev)
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
- NETIF_F_NETNS_LOCAL | NETIF_F_GSO;
+ NETIF_F_NETNS_LOCAL | NETIF_F_GSO | NETIF_F_HW_VLAN_TX;
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index c03d2c3ff03..89ad25a7620 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -61,30 +61,27 @@ static int port_cost(struct net_device *dev)
}
-/*
- * Check for port carrier transistions.
- * Called from work queue to allow for calling functions that
- * might sleep (such as speed check), and to debounce.
- */
+/* Check for port carrier transistions. */
void br_port_carrier_check(struct net_bridge_port *p)
{
struct net_device *dev = p->dev;
struct net_bridge *br = p->br;
- if (netif_carrier_ok(dev))
+ if (netif_running(dev) && netif_carrier_ok(dev))
p->path_cost = port_cost(dev);
- if (netif_running(br->dev)) {
- spin_lock_bh(&br->lock);
- if (netif_carrier_ok(dev)) {
- if (p->state == BR_STATE_DISABLED)
- br_stp_enable_port(p);
- } else {
- if (p->state != BR_STATE_DISABLED)
- br_stp_disable_port(p);
- }
- spin_unlock_bh(&br->lock);
+ if (!netif_running(br->dev))
+ return;
+
+ spin_lock_bh(&br->lock);
+ if (netif_running(dev) && netif_carrier_ok(dev)) {
+ if (p->state == BR_STATE_DISABLED)
+ br_stp_enable_port(p);
+ } else {
+ if (p->state != BR_STATE_DISABLED)
+ br_stp_disable_port(p);
}
+ spin_unlock_bh(&br->lock);
}
static void release_nbp(struct kobject *kobj)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 826cd522153..25207a1f182 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -141,7 +141,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb)
const unsigned char *dest = eth_hdr(skb)->h_dest;
int (*rhook)(struct sk_buff *skb);
- if (skb->pkt_type == PACKET_LOOPBACK)
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return skb;
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
@@ -159,7 +159,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb)
goto drop;
/* If STP is turned off, then forward */
- if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
+ if (p->br->stp_enabled == BR_NO_STP)
goto forward;
if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 2c911c0759c..865fd7634b6 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -64,22 +64,24 @@ static int brnf_filter_pppoe_tagged __read_mostly = 0;
static inline __be16 vlan_proto(const struct sk_buff *skb)
{
- return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+ if (vlan_tx_tag_present(skb))
+ return skb->protocol;
+ else if (skb->protocol == htons(ETH_P_8021Q))
+ return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+ else
+ return 0;
}
#define IS_VLAN_IP(skb) \
- (skb->protocol == htons(ETH_P_8021Q) && \
- vlan_proto(skb) == htons(ETH_P_IP) && \
+ (vlan_proto(skb) == htons(ETH_P_IP) && \
brnf_filter_vlan_tagged)
#define IS_VLAN_IPV6(skb) \
- (skb->protocol == htons(ETH_P_8021Q) && \
- vlan_proto(skb) == htons(ETH_P_IPV6) &&\
+ (vlan_proto(skb) == htons(ETH_P_IPV6) && \
brnf_filter_vlan_tagged)
#define IS_VLAN_ARP(skb) \
- (skb->protocol == htons(ETH_P_8021Q) && \
- vlan_proto(skb) == htons(ETH_P_ARP) && \
+ (vlan_proto(skb) == htons(ETH_P_ARP) && \
brnf_filter_vlan_tagged)
static inline __be16 pppoe_proto(const struct sk_buff *skb)
@@ -106,7 +108,6 @@ static struct dst_ops fake_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
.update_pmtu = fake_update_pmtu,
- .entries = ATOMIC_INIT(0),
};
/*
@@ -162,8 +163,8 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
if (tmp) {
memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
atomic_set(&tmp->use, 1);
- nf_bridge_put(nf_bridge);
}
+ nf_bridge_put(nf_bridge);
nf_bridge = tmp;
}
return nf_bridge;
@@ -209,6 +210,72 @@ static inline void nf_bridge_update_protocol(struct sk_buff *skb)
skb->protocol = htons(ETH_P_PPP_SES);
}
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ */
+
+static int br_parse_ip_options(struct sk_buff *skb)
+{
+ struct ip_options *opt;
+ struct iphdr *iph;
+ struct net_device *dev = skb->dev;
+ u32 len;
+
+ iph = ip_hdr(skb);
+ opt = &(IPCB(skb)->opt);
+
+ /* Basic sanity checks */
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ goto inhdr_error;
+
+ iph = ip_hdr(skb);
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto inhdr_error;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ } else if (len < (iph->ihl*4))
+ goto inhdr_error;
+
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ /* Zero out the CB buffer if no options present */
+ if (iph->ihl == 5) {
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ return 0;
+ }
+
+ opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+ if (ip_options_compile(dev_net(dev), opt, skb))
+ goto inhdr_error;
+
+ /* Check correct handling of SRR option */
+ if (unlikely(opt->srr)) {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev))
+ goto drop;
+
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+
+ return 0;
+
+inhdr_error:
+ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+ return -1;
+}
+
/* Fill in the header for fragmented IP packets handled by
* the IPv4 connection tracking code.
*/
@@ -549,7 +616,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
{
struct net_bridge_port *p;
struct net_bridge *br;
- struct iphdr *iph;
__u32 len = nf_bridge_encap_header_len(skb);
if (unlikely(!pskb_may_pull(skb, len)))
@@ -578,28 +644,9 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
nf_bridge_pull_encap_header_rcsum(skb);
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto inhdr_error;
-
- iph = ip_hdr(skb);
- if (iph->ihl < 5 || iph->version != 4)
- goto inhdr_error;
-
- if (!pskb_may_pull(skb, 4 * iph->ihl))
- goto inhdr_error;
-
- iph = ip_hdr(skb);
- if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0)
- goto inhdr_error;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len || len < 4 * iph->ihl)
- goto inhdr_error;
-
- pskb_trim_rcsum(skb, len);
-
- /* BUG: Should really parse the IP options here. */
- memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ if (br_parse_ip_options(skb))
+ /* Drop invalid packet */
+ goto out;
nf_bridge_put(skb->nf_bridge);
if (!nf_bridge_alloc(skb))
@@ -614,8 +661,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
return NF_STOLEN;
-inhdr_error:
-// IP_INC_STATS_BH(IpInHdrErrors);
out:
return NF_DROP;
}
@@ -759,12 +804,19 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
static int br_nf_dev_queue_xmit(struct sk_buff *skb)
{
+ int ret;
+
if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
- !skb_is_gso(skb))
- return ip_fragment(skb, br_dev_queue_push_xmit);
- else
- return br_dev_queue_push_xmit(skb);
+ !skb_is_gso(skb)) {
+ if (br_parse_ip_options(skb))
+ /* Drop invalid packet */
+ return NF_DROP;
+ ret = ip_fragment(skb, br_dev_queue_push_xmit);
+ } else
+ ret = br_dev_queue_push_xmit(skb);
+
+ return ret;
}
#else
static int br_nf_dev_queue_xmit(struct sk_buff *skb)
@@ -952,15 +1004,22 @@ int __init br_netfilter_init(void)
{
int ret;
- ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ ret = dst_entries_init(&fake_dst_ops);
if (ret < 0)
return ret;
+
+ ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ if (ret < 0) {
+ dst_entries_destroy(&fake_dst_ops);
+ return ret;
+ }
#ifdef CONFIG_SYSCTL
brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
if (brnf_sysctl_header == NULL) {
printk(KERN_WARNING
"br_netfilter: can't register to sysctl.\n");
nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ dst_entries_destroy(&fake_dst_ops);
return -ENOMEM;
}
#endif
@@ -974,4 +1033,5 @@ void br_netfilter_fini(void)
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(brnf_sysctl_header);
#endif
+ dst_entries_destroy(&fake_dst_ops);
}
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 87b53b3a921..eae67bf0446 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -39,8 +39,6 @@ static bool
ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct ebt_vlan_info *info = par->matchinfo;
- const struct vlan_hdr *fp;
- struct vlan_hdr _frame;
unsigned short TCI; /* Whole TCI, given from parsed frame */
unsigned short id; /* VLAN ID, given from frame TCI */
@@ -48,9 +46,20 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
/* VLAN encapsulated Type/Length field, given from orig frame */
__be16 encap;
- fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
- if (fp == NULL)
- return false;
+ if (vlan_tx_tag_present(skb)) {
+ TCI = vlan_tx_tag_get(skb);
+ encap = skb->protocol;
+ } else {
+ const struct vlan_hdr *fp;
+ struct vlan_hdr _frame;
+
+ fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
+ if (fp == NULL)
+ return false;
+
+ TCI = ntohs(fp->h_vlan_TCI);
+ encap = fp->h_vlan_encapsulated_proto;
+ }
/* Tag Control Information (TCI) consists of the following elements:
* - User_priority. The user_priority field is three bits in length,
@@ -59,10 +68,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
* (CFI) is a single bit flag value. Currently ignored.
* - VLAN Identifier (VID). The VID is encoded as
* an unsigned binary number. */
- TCI = ntohs(fp->h_vlan_TCI);
id = TCI & VLAN_VID_MASK;
prio = (TCI >> 13) & 0x7;
- encap = fp->h_vlan_encapsulated_proto;
/* Checking VLAN Identifier (VID) */
if (GET_BITMASK(EBT_VLAN_ID))
@@ -111,10 +118,10 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
* 0 - The null VLAN ID.
* 1 - The default Port VID (PVID)
* 0x0FFF - Reserved for implementation use.
- * if_vlan.h: VLAN_GROUP_ARRAY_LEN 4096. */
+ * if_vlan.h: VLAN_N_VID 4096. */
if (GET_BITMASK(EBT_VLAN_ID)) {
if (!!info->id) { /* if id!=0 => check vid range */
- if (info->id > VLAN_GROUP_ARRAY_LEN) {
+ if (info->id > VLAN_N_VID) {
pr_debug("id %d is out of range (1-4096)\n",
info->id);
return -EINVAL;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index bcc102e3be4..a1dcf83f0d5 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -124,16 +124,23 @@ ebt_dev_check(const char *entry, const struct net_device *device)
#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg))
/* process standard matches */
static inline int
-ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h,
+ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
const struct net_device *in, const struct net_device *out)
{
+ const struct ethhdr *h = eth_hdr(skb);
+ __be16 ethproto;
int verdict, i;
+ if (vlan_tx_tag_present(skb))
+ ethproto = htons(ETH_P_8021Q);
+ else
+ ethproto = h->h_proto;
+
if (e->bitmask & EBT_802_3) {
- if (FWINV2(ntohs(h->h_proto) >= 1536, EBT_IPROTO))
+ if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO))
return 1;
} else if (!(e->bitmask & EBT_NOPROTO) &&
- FWINV2(e->ethproto != h->h_proto, EBT_IPROTO))
+ FWINV2(e->ethproto != ethproto, EBT_IPROTO))
return 1;
if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN))
@@ -213,7 +220,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
base = private->entries;
i = 0;
while (i < nentries) {
- if (ebt_basic_match(point, eth_hdr(skb), in, out))
+ if (ebt_basic_match(point, skb, in, out))
goto letscontinue;
if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 0b586e9d137..b99369a055d 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -9,6 +9,8 @@
* and Sakari Ailus <sakari.ailus@nokia.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/version.h>
#include <linux/module.h>
#include <linux/kernel.h>
@@ -171,7 +173,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev,
net = dev_net(dev);
pkt = cfpkt_fromnative(CAIF_DIR_IN, skb);
caifd = caif_get(dev);
- if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd)
+ if (!caifd || !caifd->layer.up || !caifd->layer.up->receive)
return NET_RX_DROP;
if (caifd->layer.up->receive(caifd->layer.up, pkt))
@@ -214,7 +216,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
switch (what) {
case NETDEV_REGISTER:
- pr_info("CAIF: %s():register %s\n", __func__, dev->name);
+ netdev_info(dev, "register\n");
caifd = caif_device_alloc(dev);
if (caifd == NULL)
break;
@@ -225,14 +227,13 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
break;
case NETDEV_UP:
- pr_info("CAIF: %s(): up %s\n", __func__, dev->name);
+ netdev_info(dev, "up\n");
caifd = caif_get(dev);
if (caifd == NULL)
break;
caifdev = netdev_priv(dev);
if (atomic_read(&caifd->state) == NETDEV_UP) {
- pr_info("CAIF: %s():%s already up\n",
- __func__, dev->name);
+ netdev_info(dev, "already up\n");
break;
}
atomic_set(&caifd->state, what);
@@ -273,7 +274,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
caifd = caif_get(dev);
if (caifd == NULL)
break;
- pr_info("CAIF: %s():going down %s\n", __func__, dev->name);
+ netdev_info(dev, "going down\n");
if (atomic_read(&caifd->state) == NETDEV_GOING_DOWN ||
atomic_read(&caifd->state) == NETDEV_DOWN)
@@ -295,11 +296,10 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
caifd = caif_get(dev);
if (caifd == NULL)
break;
- pr_info("CAIF: %s(): down %s\n", __func__, dev->name);
+ netdev_info(dev, "down\n");
if (atomic_read(&caifd->in_use))
- pr_warning("CAIF: %s(): "
- "Unregistering an active CAIF device: %s\n",
- __func__, dev->name);
+ netdev_warn(dev,
+ "Unregistering an active CAIF device\n");
cfcnfg_del_phy_layer(get_caif_conf(), &caifd->layer);
dev_put(dev);
atomic_set(&caifd->state, what);
@@ -307,7 +307,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
case NETDEV_UNREGISTER:
caifd = caif_get(dev);
- pr_info("CAIF: %s(): unregister %s\n", __func__, dev->name);
+ netdev_info(dev, "unregister\n");
atomic_set(&caifd->state, what);
caif_device_destroy(dev);
break;
@@ -391,7 +391,7 @@ static int __init caif_device_init(void)
int result;
cfg = cfcnfg_create();
if (!cfg) {
- pr_warning("CAIF: %s(): can't create cfcnfg.\n", __func__);
+ pr_warn("can't create cfcnfg\n");
goto err_cfcnfg_create_failed;
}
result = register_pernet_device(&caif_net_ops);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 8ce90478611..2eca2dd0000 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -15,7 +17,6 @@
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/uaccess.h>
-#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <linux/caif/caif_socket.h>
#include <asm/atomic.h>
@@ -28,9 +29,6 @@
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(AF_CAIF);
-#define CAIF_DEF_SNDBUF (4096*10)
-#define CAIF_DEF_RCVBUF (4096*100)
-
/*
* CAIF state is re-using the TCP socket states.
* caif_states stored in sk_state reflect the state as reported by
@@ -157,9 +155,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
- trace_printk("CAIF: %s():"
- " sending flow OFF (queue len = %d %d)\n",
- __func__,
+ pr_debug("sending flow OFF (queue len = %d %d)\n",
atomic_read(&cf_sk->sk.sk_rmem_alloc),
sk_rcvbuf_lowwater(cf_sk));
set_rx_flow_off(cf_sk);
@@ -172,9 +168,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return err;
if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
set_rx_flow_off(cf_sk);
- trace_printk("CAIF: %s():"
- " sending flow OFF due to rmem_schedule\n",
- __func__);
+ pr_debug("sending flow OFF due to rmem_schedule\n");
dbfs_atomic_inc(&cnt.num_rx_flow_off);
caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
}
@@ -275,8 +269,7 @@ static void caif_ctrl_cb(struct cflayer *layr,
break;
default:
- pr_debug("CAIF: %s(): Unexpected flow command %d\n",
- __func__, flow);
+ pr_debug("Unexpected flow command %d\n", flow);
}
}
@@ -536,8 +529,7 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
/* Slight paranoia, probably not needed. */
if (unlikely(loopcnt++ > 1000)) {
- pr_warning("CAIF: %s(): transmit retries failed,"
- " error = %d\n", __func__, ret);
+ pr_warn("transmit retries failed, error = %d\n", ret);
break;
}
@@ -827,6 +819,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
long timeo;
int err;
int ifindex, headroom, tailroom;
+ unsigned int mtu;
struct net_device *dev;
lock_sock(sk);
@@ -896,15 +889,22 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
cf_sk->sk.sk_state = CAIF_DISCONNECTED;
goto out;
}
- dev = dev_get_by_index(sock_net(sk), ifindex);
+
+ err = -ENODEV;
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+ if (!dev) {
+ rcu_read_unlock();
+ goto out;
+ }
cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom);
+ mtu = dev->mtu;
+ rcu_read_unlock();
+
cf_sk->tailroom = tailroom;
- cf_sk->maxframe = dev->mtu - (headroom + tailroom);
- dev_put(dev);
+ cf_sk->maxframe = mtu - (headroom + tailroom);
if (cf_sk->maxframe < 1) {
- pr_warning("CAIF: %s(): CAIF Interface MTU too small (%d)\n",
- __func__, dev->mtu);
- err = -ENODEV;
+ pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu);
goto out;
}
@@ -1123,10 +1123,6 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
/* Store the protocol */
sk->sk_protocol = (unsigned char) protocol;
- /* Sendbuf dictates the amount of outbound packets not yet sent */
- sk->sk_sndbuf = CAIF_DEF_SNDBUF;
- sk->sk_rcvbuf = CAIF_DEF_RCVBUF;
-
/*
* Lock in order to try to stop someone from opening the socket
* too early.
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 1c29189b344..41adafd1891 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -3,6 +3,9 @@
* Author: Sjur Brendeland/sjur.brandeland@stericsson.com
* License terms: GNU General Public License (GPL) version 2
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/slab.h>
@@ -78,7 +81,7 @@ struct cfcnfg *cfcnfg_create(void)
/* Initiate this layer */
this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC);
if (!this) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
this->mux = cfmuxl_create();
@@ -106,7 +109,7 @@ struct cfcnfg *cfcnfg_create(void)
layer_set_up(this->ctrl, this);
return this;
out_of_mem:
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
kfree(this->mux);
kfree(this->ctrl);
kfree(this);
@@ -194,7 +197,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
caif_assert(adap_layer != NULL);
channel_id = adap_layer->id;
if (adap_layer->dn == NULL || channel_id == 0) {
- pr_err("CAIF: %s():adap_layer->id is 0\n", __func__);
+ pr_err("adap_layer->dn == NULL or adap_layer->id is 0\n");
ret = -ENOTCONN;
goto end;
}
@@ -204,9 +207,8 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
layer_set_up(servl, NULL);
ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
if (servl == NULL) {
- pr_err("CAIF: %s(): PROTOCOL ERROR "
- "- Error removing service_layer Channel_Id(%d)",
- __func__, channel_id);
+ pr_err("PROTOCOL ERROR - Error removing service_layer Channel_Id(%d)",
+ channel_id);
ret = -EINVAL;
goto end;
}
@@ -216,18 +218,14 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
phyinfo = cfcnfg_get_phyinfo(cnfg, phyid);
if (phyinfo == NULL) {
- pr_warning("CAIF: %s(): "
- "No interface to send disconnect to\n",
- __func__);
+ pr_warn("No interface to send disconnect to\n");
ret = -ENODEV;
goto end;
}
if (phyinfo->id != phyid ||
phyinfo->phy_layer->id != phyid ||
phyinfo->frm_layer->id != phyid) {
- pr_err("CAIF: %s(): "
- "Inconsistency in phy registration\n",
- __func__);
+ pr_err("Inconsistency in phy registration\n");
ret = -EINVAL;
goto end;
}
@@ -276,21 +274,20 @@ int cfcnfg_add_adaptation_layer(struct cfcnfg *cnfg,
{
struct cflayer *frml;
if (adap_layer == NULL) {
- pr_err("CAIF: %s(): adap_layer is zero", __func__);
+ pr_err("adap_layer is zero\n");
return -EINVAL;
}
if (adap_layer->receive == NULL) {
- pr_err("CAIF: %s(): adap_layer->receive is NULL", __func__);
+ pr_err("adap_layer->receive is NULL\n");
return -EINVAL;
}
if (adap_layer->ctrlcmd == NULL) {
- pr_err("CAIF: %s(): adap_layer->ctrlcmd == NULL", __func__);
+ pr_err("adap_layer->ctrlcmd == NULL\n");
return -EINVAL;
}
frml = cnfg->phy_layers[param->phyid].frm_layer;
if (frml == NULL) {
- pr_err("CAIF: %s(): Specified PHY type does not exist!",
- __func__);
+ pr_err("Specified PHY type does not exist!\n");
return -ENODEV;
}
caif_assert(param->phyid == cnfg->phy_layers[param->phyid].id);
@@ -330,9 +327,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
struct net_device *netdev;
if (adapt_layer == NULL) {
- pr_debug("CAIF: %s(): link setup response "
- "but no client exist, send linkdown back\n",
- __func__);
+ pr_debug("link setup response but no client exist, send linkdown back\n");
cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
return;
}
@@ -374,13 +369,11 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
break;
default:
- pr_err("CAIF: %s(): Protocol error. "
- "Link setup response - unknown channel type\n",
- __func__);
+ pr_err("Protocol error. Link setup response - unknown channel type\n");
return;
}
if (!servicel) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return;
}
layer_set_dn(servicel, cnfg->mux);
@@ -418,7 +411,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
}
}
if (*phyid == 0) {
- pr_err("CAIF: %s(): No Available PHY ID\n", __func__);
+ pr_err("No Available PHY ID\n");
return;
}
@@ -427,7 +420,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
phy_driver =
cfserl_create(CFPHYTYPE_FRAG, *phyid, stx);
if (!phy_driver) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return;
}
@@ -436,7 +429,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
phy_driver = NULL;
break;
default:
- pr_err("CAIF: %s(): %d", __func__, phy_type);
+ pr_err("%d\n", phy_type);
return;
break;
}
@@ -455,7 +448,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
phy_layer->type = phy_type;
frml = cffrml_create(*phyid, fcs);
if (!frml) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return;
}
cnfg->phy_layers[*phyid].frm_layer = frml;
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 563145fdc4c..08f267a109a 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
@@ -36,7 +38,7 @@ struct cflayer *cfctrl_create(void)
struct cfctrl *this =
kmalloc(sizeof(struct cfctrl), GFP_ATOMIC);
if (!this) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
@@ -132,9 +134,7 @@ struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
if (cfctrl_req_eq(req, p)) {
if (p != first)
- pr_warning("CAIF: %s(): Requests are not "
- "received in order\n",
- __func__);
+ pr_warn("Requests are not received in order\n");
atomic_set(&ctrl->rsp_seq_no,
p->sequence_no);
@@ -177,7 +177,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
int ret;
struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return;
}
caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
@@ -189,8 +189,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
ret =
cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
if (ret < 0) {
- pr_err("CAIF: %s(): Could not transmit enum message\n",
- __func__);
+ pr_err("Could not transmit enum message\n");
cfpkt_destroy(pkt);
}
}
@@ -208,7 +207,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
char utility_name[16];
struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return -ENOMEM;
}
cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP);
@@ -253,13 +252,13 @@ int cfctrl_linkup_request(struct cflayer *layer,
param->u.utility.paramlen);
break;
default:
- pr_warning("CAIF: %s():Request setup of bad link type = %d\n",
- __func__, param->linktype);
+ pr_warn("Request setup of bad link type = %d\n",
+ param->linktype);
return -EINVAL;
}
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return -ENOMEM;
}
req->client_layer = user_layer;
@@ -276,8 +275,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
ret =
cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
if (ret < 0) {
- pr_err("CAIF: %s(): Could not transmit linksetup request\n",
- __func__);
+ pr_err("Could not transmit linksetup request\n");
cfpkt_destroy(pkt);
return -ENODEV;
}
@@ -291,7 +289,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
struct cfctrl *cfctrl = container_obj(layer);
struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return -ENOMEM;
}
cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY);
@@ -300,8 +298,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
ret =
cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
if (ret < 0) {
- pr_err("CAIF: %s(): Could not transmit link-down request\n",
- __func__);
+ pr_err("Could not transmit link-down request\n");
cfpkt_destroy(pkt);
}
return ret;
@@ -313,7 +310,7 @@ void cfctrl_sleep_req(struct cflayer *layer)
struct cfctrl *cfctrl = container_obj(layer);
struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return;
}
cfpkt_addbdy(pkt, CFCTRL_CMD_SLEEP);
@@ -330,7 +327,7 @@ void cfctrl_wake_req(struct cflayer *layer)
struct cfctrl *cfctrl = container_obj(layer);
struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return;
}
cfpkt_addbdy(pkt, CFCTRL_CMD_WAKE);
@@ -347,7 +344,7 @@ void cfctrl_getstartreason_req(struct cflayer *layer)
struct cfctrl *cfctrl = container_obj(layer);
struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return;
}
cfpkt_addbdy(pkt, CFCTRL_CMD_START_REASON);
@@ -364,12 +361,11 @@ void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
struct cfctrl_request_info *p, *tmp;
struct cfctrl *ctrl = container_obj(layr);
spin_lock(&ctrl->info_list_lock);
- pr_warning("CAIF: %s(): enter\n", __func__);
+ pr_warn("enter\n");
list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
if (p->client_layer == adap_layer) {
- pr_warning("CAIF: %s(): cancel req :%d\n", __func__,
- p->sequence_no);
+ pr_warn("cancel req :%d\n", p->sequence_no);
list_del(&p->list);
kfree(p);
}
@@ -520,9 +516,8 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
cfpkt_extr_head(pkt, &param, len);
break;
default:
- pr_warning("CAIF: %s(): Request setup "
- "- invalid link type (%d)",
- __func__, serv);
+ pr_warn("Request setup - invalid link type (%d)\n",
+ serv);
goto error;
}
@@ -532,9 +527,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
cfpkt_erroneous(pkt)) {
- pr_err("CAIF: %s(): Invalid O/E bit or parse "
- "error on CAIF control channel",
- __func__);
+ pr_err("Invalid O/E bit or parse error on CAIF control channel\n");
cfctrl->res.reject_rsp(cfctrl->serv.layer.up,
0,
req ? req->client_layer
@@ -556,8 +549,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
break;
case CFCTRL_CMD_LINK_ERR:
- pr_err("CAIF: %s(): Frame Error Indication received\n",
- __func__);
+ pr_err("Frame Error Indication received\n");
cfctrl->res.linkerror_ind();
break;
case CFCTRL_CMD_ENUM:
@@ -576,7 +568,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
cfctrl->res.radioset_rsp();
break;
default:
- pr_err("CAIF: %s(): Unrecognized Control Frame\n", __func__);
+ pr_err("Unrecognized Control Frame\n");
goto error;
break;
}
@@ -595,8 +587,7 @@ static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
case CAIF_CTRLCMD_FLOW_OFF_IND:
spin_lock(&this->info_list_lock);
if (!list_empty(&this->list)) {
- pr_debug("CAIF: %s(): Received flow off in "
- "control layer", __func__);
+ pr_debug("Received flow off in control layer\n");
}
spin_unlock(&this->info_list_lock);
break;
@@ -620,7 +611,7 @@ static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt)
if (!ctrl->loop_linkused[linkid])
goto found;
spin_unlock(&ctrl->loop_linkid_lock);
- pr_err("CAIF: %s(): Out of link-ids\n", __func__);
+ pr_err("Out of link-ids\n");
return -EINVAL;
found:
if (!ctrl->loop_linkused[linkid])
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
index 676648cac8d..496fda9ac66 100644
--- a/net/caif/cfdbgl.c
+++ b/net/caif/cfdbgl.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/slab.h>
#include <net/caif/caif_layer.h>
@@ -17,7 +19,7 @@ struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
{
struct cfsrvl *dbg = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
if (!dbg) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cfsrvl, layer) == 0);
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index ed9d53aff28..d3ed264ad6c 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
@@ -26,7 +28,7 @@ struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
{
struct cfsrvl *dgm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
if (!dgm) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -49,14 +51,14 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
caif_assert(layr->ctrlcmd != NULL);
if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
if ((cmd & DGM_CMD_BIT) == 0) {
if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -75,8 +77,7 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
return 0;
default:
cfpkt_destroy(pkt);
- pr_info("CAIF: %s(): Unknown datagram control %d (0x%x)\n",
- __func__, cmd, cmd);
+ pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd);
return -EPROTO;
}
}
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
index e86a4ca3b21..a445043931a 100644
--- a/net/caif/cffrml.c
+++ b/net/caif/cffrml.c
@@ -6,6 +6,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
@@ -32,7 +34,7 @@ struct cflayer *cffrml_create(u16 phyid, bool use_fcs)
{
struct cffrml *this = kmalloc(sizeof(struct cffrml), GFP_ATOMIC);
if (!this) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cffrml, layer) == 0);
@@ -83,7 +85,7 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
if (cfpkt_setlen(pkt, len) < 0) {
++cffrml_rcv_error;
- pr_err("CAIF: %s():Framing length error (%d)\n", __func__, len);
+ pr_err("Framing length error (%d)\n", len);
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -99,14 +101,14 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
cfpkt_add_trail(pkt, &tmp, 2);
++cffrml_rcv_error;
++cffrml_rcv_checsum_error;
- pr_info("CAIF: %s(): Frame checksum error "
- "(0x%x != 0x%x)\n", __func__, hdrchks, pktchks);
+ pr_info("Frame checksum error (0x%x != 0x%x)\n",
+ hdrchks, pktchks);
return -EILSEQ;
}
}
if (cfpkt_erroneous(pkt)) {
++cffrml_rcv_error;
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -132,7 +134,7 @@ static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
cfpkt_add_head(pkt, &tmp, 2);
cfpkt_info(pkt)->hdr_len += 2;
if (cfpkt_erroneous(pkt)) {
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
return -EPROTO;
}
ret = layr->dn->transmit(layr->dn, pkt);
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index 80c8d332b25..46f34b2e047 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -3,6 +3,9 @@
* Author: Sjur Brendeland/sjur.brandeland@stericsson.com
* License terms: GNU General Public License (GPL) version 2
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
@@ -190,7 +193,7 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
u8 id;
struct cflayer *up;
if (cfpkt_extr_head(pkt, &id, 1) < 0) {
- pr_err("CAIF: %s(): erroneous Caif Packet\n", __func__);
+ pr_err("erroneous Caif Packet\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -199,8 +202,8 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
up = get_up(muxl, id);
spin_unlock(&muxl->receive_lock);
if (up == NULL) {
- pr_info("CAIF: %s():Received data on unknown link ID = %d "
- "(0x%x) up == NULL", __func__, id, id);
+ pr_info("Received data on unknown link ID = %d (0x%x) up == NULL",
+ id, id);
cfpkt_destroy(pkt);
/*
* Don't return ERROR, since modem misbehaves and sends out
@@ -223,9 +226,8 @@ static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
struct caif_payload_info *info = cfpkt_info(pkt);
dn = get_dn(muxl, cfpkt_info(pkt)->dev_info);
if (dn == NULL) {
- pr_warning("CAIF: %s(): Send data on unknown phy "
- "ID = %d (0x%x)\n",
- __func__, info->dev_info->id, info->dev_info->id);
+ pr_warn("Send data on unknown phy ID = %d (0x%x)\n",
+ info->dev_info->id, info->dev_info->id);
return -ENOTCONN;
}
info->hdr_len += 1;
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index 01f238ff234..d7e865e2ff6 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -4,19 +4,22 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/hardirq.h>
#include <net/caif/cfpkt.h>
-#define PKT_PREFIX 16
+#define PKT_PREFIX 48
#define PKT_POSTFIX 2
#define PKT_LEN_WHEN_EXTENDING 128
-#define PKT_ERROR(pkt, errmsg) do { \
- cfpkt_priv(pkt)->erronous = true; \
- skb_reset_tail_pointer(&pkt->skb); \
- pr_warning("CAIF: " errmsg);\
- } while (0)
+#define PKT_ERROR(pkt, errmsg) \
+do { \
+ cfpkt_priv(pkt)->erronous = true; \
+ skb_reset_tail_pointer(&pkt->skb); \
+ pr_warn(errmsg); \
+} while (0)
struct cfpktq {
struct sk_buff_head head;
@@ -130,13 +133,13 @@ int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len)
return -EPROTO;
if (unlikely(len > skb->len)) {
- PKT_ERROR(pkt, "cfpkt_extr_head read beyond end of packet\n");
+ PKT_ERROR(pkt, "read beyond end of packet\n");
return -EPROTO;
}
if (unlikely(len > skb_headlen(skb))) {
if (unlikely(skb_linearize(skb) != 0)) {
- PKT_ERROR(pkt, "cfpkt_extr_head linearize failed\n");
+ PKT_ERROR(pkt, "linearize failed\n");
return -EPROTO;
}
}
@@ -156,11 +159,11 @@ int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len)
return -EPROTO;
if (unlikely(skb_linearize(skb) != 0)) {
- PKT_ERROR(pkt, "cfpkt_extr_trail linearize failed\n");
+ PKT_ERROR(pkt, "linearize failed\n");
return -EPROTO;
}
if (unlikely(skb->data + len > skb_tail_pointer(skb))) {
- PKT_ERROR(pkt, "cfpkt_extr_trail read beyond end of packet\n");
+ PKT_ERROR(pkt, "read beyond end of packet\n");
return -EPROTO;
}
from = skb_tail_pointer(skb) - len;
@@ -202,7 +205,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
/* Make sure data is writable */
if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) {
- PKT_ERROR(pkt, "cfpkt_add_body: cow failed\n");
+ PKT_ERROR(pkt, "cow failed\n");
return -EPROTO;
}
/*
@@ -211,8 +214,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
* lengths of the top SKB.
*/
if (lastskb != skb) {
- pr_warning("CAIF: %s(): Packet is non-linear\n",
- __func__);
+ pr_warn("Packet is non-linear\n");
skb->len += len;
skb->data_len += len;
}
@@ -242,14 +244,14 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
if (unlikely(is_erronous(pkt)))
return -EPROTO;
if (unlikely(skb_headroom(skb) < len)) {
- PKT_ERROR(pkt, "cfpkt_add_head: no headroom\n");
+ PKT_ERROR(pkt, "no headroom\n");
return -EPROTO;
}
/* Make sure data is writable */
ret = skb_cow_data(skb, 0, &lastskb);
if (unlikely(ret < 0)) {
- PKT_ERROR(pkt, "cfpkt_add_head: cow failed\n");
+ PKT_ERROR(pkt, "cow failed\n");
return ret;
}
@@ -283,7 +285,7 @@ inline u16 cfpkt_iterate(struct cfpkt *pkt,
if (unlikely(is_erronous(pkt)))
return -EPROTO;
if (unlikely(skb_linearize(&pkt->skb) != 0)) {
- PKT_ERROR(pkt, "cfpkt_iterate: linearize failed\n");
+ PKT_ERROR(pkt, "linearize failed\n");
return -EPROTO;
}
return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt));
@@ -309,7 +311,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len)
/* Need to expand SKB */
if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len)))
- PKT_ERROR(pkt, "cfpkt_setlen: skb_pad_trail failed\n");
+ PKT_ERROR(pkt, "skb_pad_trail failed\n");
return cfpkt_getlen(pkt);
}
@@ -380,8 +382,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
return NULL;
if (skb->data + pos > skb_tail_pointer(skb)) {
- PKT_ERROR(pkt,
- "cfpkt_split: trying to split beyond end of packet");
+ PKT_ERROR(pkt, "trying to split beyond end of packet\n");
return NULL;
}
@@ -455,17 +456,17 @@ int cfpkt_raw_append(struct cfpkt *pkt, void **buf, unsigned int buflen)
return -EPROTO;
/* Make sure SKB is writable */
if (unlikely(skb_cow_data(skb, 0, &lastskb) < 0)) {
- PKT_ERROR(pkt, "cfpkt_raw_append: skb_cow_data failed\n");
+ PKT_ERROR(pkt, "skb_cow_data failed\n");
return -EPROTO;
}
if (unlikely(skb_linearize(skb) != 0)) {
- PKT_ERROR(pkt, "cfpkt_raw_append: linearize failed\n");
+ PKT_ERROR(pkt, "linearize failed\n");
return -EPROTO;
}
if (unlikely(skb_tailroom(skb) < buflen)) {
- PKT_ERROR(pkt, "cfpkt_raw_append: buffer too short - failed\n");
+ PKT_ERROR(pkt, "buffer too short - failed\n");
return -EPROTO;
}
@@ -483,14 +484,13 @@ int cfpkt_raw_extract(struct cfpkt *pkt, void **buf, unsigned int buflen)
return -EPROTO;
if (unlikely(buflen > skb->len)) {
- PKT_ERROR(pkt, "cfpkt_raw_extract: buflen too large "
- "- failed\n");
+ PKT_ERROR(pkt, "buflen too large - failed\n");
return -EPROTO;
}
if (unlikely(buflen > skb_headlen(skb))) {
if (unlikely(skb_linearize(skb) != 0)) {
- PKT_ERROR(pkt, "cfpkt_raw_extract: linearize failed\n");
+ PKT_ERROR(pkt, "linearize failed\n");
return -EPROTO;
}
}
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index eb1602022ac..bde8481e8d2 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -4,10 +4,12 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
-#include <linux/unaligned/le_byteshift.h>
+#include <asm/unaligned.h>
#include <net/caif/caif_layer.h>
#include <net/caif/cfsrvl.h>
#include <net/caif/cfpkt.h>
@@ -48,7 +50,7 @@ struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
if (!this) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
@@ -178,9 +180,7 @@ out:
cfpkt_destroy(rfml->incomplete_frm);
rfml->incomplete_frm = NULL;
- pr_info("CAIF: %s(): "
- "Connection error %d triggered on RFM link\n",
- __func__, err);
+ pr_info("Connection error %d triggered on RFM link\n", err);
/* Trigger connection error upon failure.*/
layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
@@ -280,9 +280,7 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
out:
if (err != 0) {
- pr_info("CAIF: %s(): "
- "Connection error %d triggered on RFM link\n",
- __func__, err);
+ pr_info("Connection error %d triggered on RFM link\n", err);
/* Trigger connection error upon failure.*/
layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index a11fbd68a13..9297f7dea9d 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
@@ -34,7 +36,7 @@ struct cflayer *cfserl_create(int type, int instance, bool use_stx)
{
struct cfserl *this = kmalloc(sizeof(struct cfserl), GFP_ATOMIC);
if (!this) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cfserl, layer) == 0);
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index f40939a9121..ab5e542526b 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/errno.h>
@@ -79,8 +81,7 @@ static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
layr->up->ctrlcmd(layr->up, ctrl, phyid);
break;
default:
- pr_warning("CAIF: %s(): "
- "Unexpected ctrl in cfsrvl (%d)\n", __func__, ctrl);
+ pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl);
/* We have both modem and phy flow on, send flow on */
layr->up->ctrlcmd(layr->up, ctrl, phyid);
service->phy_flow_on = true;
@@ -107,14 +108,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
u8 flow_on = SRVL_FLOW_ON;
pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n",
- __func__);
+ pr_warn("Out of memory\n");
return -ENOMEM;
}
if (cfpkt_add_head(pkt, &flow_on, 1) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n",
- __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -131,14 +130,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
u8 flow_off = SRVL_FLOW_OFF;
pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
if (!pkt) {
- pr_warning("CAIF: %s(): Out of memory\n",
- __func__);
+ pr_warn("Out of memory\n");
return -ENOMEM;
}
if (cfpkt_add_head(pkt, &flow_off, 1) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n",
- __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index 02795aff57a..efad410e4c8 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -26,7 +28,7 @@ struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
{
struct cfsrvl *util = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
if (!util) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -47,7 +49,7 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
caif_assert(layr->up->receive != NULL);
caif_assert(layr->up->ctrlcmd != NULL);
if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -64,16 +66,14 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
cfpkt_destroy(pkt);
return 0;
case UTIL_REMOTE_SHUTDOWN: /* Remote Shutdown Request */
- pr_err("CAIF: %s(): REMOTE SHUTDOWN REQUEST RECEIVED\n",
- __func__);
+ pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n");
layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0);
service->open = false;
cfpkt_destroy(pkt);
return 0;
default:
cfpkt_destroy(pkt);
- pr_warning("CAIF: %s(): Unknown service control %d (0x%x)\n",
- __func__, cmd, cmd);
+ pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd);
return -EPROTO;
}
}
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index 77cc09faac9..3b425b189a9 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/stddef.h>
#include <linux/slab.h>
#include <net/caif/caif_layer.h>
@@ -25,7 +27,7 @@ struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
{
struct cfsrvl *vei = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
if (!vei) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -47,7 +49,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -67,8 +69,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
cfpkt_destroy(pkt);
return 0;
default: /* SET RS232 PIN */
- pr_warning("CAIF: %s():Unknown VEI control packet %d (0x%x)!\n",
- __func__, cmd, cmd);
+ pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd);
cfpkt_destroy(pkt);
return -EPROTO;
}
@@ -86,7 +87,7 @@ static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt)
caif_assert(layr->dn->transmit != NULL);
if (cfpkt_add_head(pkt, &tmp, 1) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
return -EPROTO;
}
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
index ada6ee2d48f..bf6fef2a0ef 100644
--- a/net/caif/cfvidl.c
+++ b/net/caif/cfvidl.c
@@ -4,6 +4,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -21,7 +23,7 @@ struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
{
struct cfsrvl *vid = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
if (!vid) {
- pr_warning("CAIF: %s(): Out of memory\n", __func__);
+ pr_warn("Out of memory\n");
return NULL;
}
caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -38,7 +40,7 @@ static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt)
{
u32 videoheader;
if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) {
- pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+ pr_err("Packet is erroneous!\n");
cfpkt_destroy(pkt);
return -EPROTO;
}
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 4293e190ec5..84a422c9894 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -5,6 +5,8 @@
* License terms: GNU General Public License (GPL) version 2
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
#include <linux/version.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -28,9 +30,6 @@
#define CONNECT_TIMEOUT (5 * HZ)
#define CAIF_NET_DEFAULT_QUEUE_LEN 500
-#undef pr_debug
-#define pr_debug pr_warning
-
/*This list is protected by the rtnl lock. */
static LIST_HEAD(chnl_net_list);
@@ -142,8 +141,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
int phyid)
{
struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
- pr_debug("CAIF: %s(): NET flowctrl func called flow: %s\n",
- __func__,
+ pr_debug("NET flowctrl func called flow: %s\n",
flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" :
flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" :
flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" :
@@ -196,12 +194,12 @@ static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
priv = netdev_priv(dev);
if (skb->len > priv->netdev->mtu) {
- pr_warning("CAIF: %s(): Size of skb exceeded MTU\n", __func__);
+ pr_warn("Size of skb exceeded MTU\n");
return -ENOSPC;
}
if (!priv->flowenabled) {
- pr_debug("CAIF: %s(): dropping packets flow off\n", __func__);
+ pr_debug("dropping packets flow off\n");
return NETDEV_TX_BUSY;
}
@@ -237,7 +235,7 @@ static int chnl_net_open(struct net_device *dev)
ASSERT_RTNL();
priv = netdev_priv(dev);
if (!priv) {
- pr_debug("CAIF: %s(): chnl_net_open: no priv\n", __func__);
+ pr_debug("chnl_net_open: no priv\n");
return -ENODEV;
}
@@ -246,18 +244,17 @@ static int chnl_net_open(struct net_device *dev)
result = caif_connect_client(&priv->conn_req, &priv->chnl,
&llifindex, &headroom, &tailroom);
if (result != 0) {
- pr_debug("CAIF: %s(): err: "
- "Unable to register and open device,"
- " Err:%d\n",
- __func__,
- result);
+ pr_debug("err: "
+ "Unable to register and open device,"
+ " Err:%d\n",
+ result);
goto error;
}
lldev = dev_get_by_index(dev_net(dev), llifindex);
if (lldev == NULL) {
- pr_debug("CAIF: %s(): no interface?\n", __func__);
+ pr_debug("no interface?\n");
result = -ENODEV;
goto error;
}
@@ -279,9 +276,7 @@ static int chnl_net_open(struct net_device *dev)
dev_put(lldev);
if (mtu < 100) {
- pr_warning("CAIF: %s(): "
- "CAIF Interface MTU too small (%d)\n",
- __func__, mtu);
+ pr_warn("CAIF Interface MTU too small (%d)\n", mtu);
result = -ENODEV;
goto error;
}
@@ -296,33 +291,32 @@ static int chnl_net_open(struct net_device *dev)
rtnl_lock();
if (result == -ERESTARTSYS) {
- pr_debug("CAIF: %s(): wait_event_interruptible"
- " woken by a signal\n", __func__);
+ pr_debug("wait_event_interruptible woken by a signal\n");
result = -ERESTARTSYS;
goto error;
}
if (result == 0) {
- pr_debug("CAIF: %s(): connect timeout\n", __func__);
+ pr_debug("connect timeout\n");
caif_disconnect_client(&priv->chnl);
priv->state = CAIF_DISCONNECTED;
- pr_debug("CAIF: %s(): state disconnected\n", __func__);
+ pr_debug("state disconnected\n");
result = -ETIMEDOUT;
goto error;
}
if (priv->state != CAIF_CONNECTED) {
- pr_debug("CAIF: %s(): connect failed\n", __func__);
+ pr_debug("connect failed\n");
result = -ECONNREFUSED;
goto error;
}
- pr_debug("CAIF: %s(): CAIF Netdevice connected\n", __func__);
+ pr_debug("CAIF Netdevice connected\n");
return 0;
error:
caif_disconnect_client(&priv->chnl);
priv->state = CAIF_DISCONNECTED;
- pr_debug("CAIF: %s(): state disconnected\n", __func__);
+ pr_debug("state disconnected\n");
return result;
}
@@ -413,7 +407,7 @@ static void caif_netlink_parms(struct nlattr *data[],
struct caif_connect_request *conn_req)
{
if (!data) {
- pr_warning("CAIF: %s: no params data found\n", __func__);
+ pr_warn("no params data found\n");
return;
}
if (data[IFLA_CAIF_IPV4_CONNID])
@@ -442,8 +436,7 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
ret = register_netdevice(dev);
if (ret)
- pr_warning("CAIF: %s(): device rtml registration failed\n",
- __func__);
+ pr_warn("device rtml registration failed\n");
return ret;
}
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 9c65e9deb9c..08ffe9e4be2 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -60,6 +60,13 @@
#include <net/sock.h>
#include <net/net_namespace.h>
+/*
+ * To send multiple CAN frame content within TX_SETUP or to filter
+ * CAN messages with multiplex index within RX_SETUP, the number of
+ * different filters is limited to 256 due to the one byte index value.
+ */
+#define MAX_NFRAMES 256
+
/* use of last_frames[index].can_dlc */
#define RX_RECV 0x40 /* received data for this element */
#define RX_THR 0x80 /* element not been sent due to throttle feature */
@@ -89,16 +96,16 @@ struct bcm_op {
struct list_head list;
int ifindex;
canid_t can_id;
- int flags;
+ u32 flags;
unsigned long frames_abs, frames_filtered;
struct timeval ival1, ival2;
struct hrtimer timer, thrtimer;
struct tasklet_struct tsklet, thrtsklet;
ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
int rx_ifindex;
- int count;
- int nframes;
- int currframe;
+ u32 count;
+ u32 nframes;
+ u32 currframe;
struct can_frame *frames;
struct can_frame *last_frames;
struct can_frame sframe;
@@ -175,7 +182,7 @@ static int bcm_proc_show(struct seq_file *m, void *v)
seq_printf(m, "rx_op: %03X %-5s ",
op->can_id, bcm_proc_getifname(ifname, op->ifindex));
- seq_printf(m, "[%d]%c ", op->nframes,
+ seq_printf(m, "[%u]%c ", op->nframes,
(op->flags & RX_CHECK_DLC)?'d':' ');
if (op->kt_ival1.tv64)
seq_printf(m, "timeo=%lld ",
@@ -198,7 +205,7 @@ static int bcm_proc_show(struct seq_file *m, void *v)
list_for_each_entry(op, &bo->tx_ops, list) {
- seq_printf(m, "tx_op: %03X %s [%d] ",
+ seq_printf(m, "tx_op: %03X %s [%u] ",
op->can_id,
bcm_proc_getifname(ifname, op->ifindex),
op->nframes);
@@ -283,7 +290,7 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
struct can_frame *firstframe;
struct sockaddr_can *addr;
struct sock *sk = op->sk;
- int datalen = head->nframes * CFSIZ;
+ unsigned int datalen = head->nframes * CFSIZ;
int err;
skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
@@ -468,7 +475,7 @@ rx_changed_settime:
* bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly
* received data stored in op->last_frames[]
*/
-static void bcm_rx_cmp_to_index(struct bcm_op *op, int index,
+static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
const struct can_frame *rxdata)
{
/*
@@ -554,7 +561,8 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
/*
* bcm_rx_do_flush - helper for bcm_rx_thr_flush
*/
-static inline int bcm_rx_do_flush(struct bcm_op *op, int update, int index)
+static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
+ unsigned int index)
{
if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) {
if (update)
@@ -575,7 +583,7 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update)
int updated = 0;
if (op->nframes > 1) {
- int i;
+ unsigned int i;
/* for MUX filter we start at index 1 */
for (i = 1; i < op->nframes; i++)
@@ -624,7 +632,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
{
struct bcm_op *op = (struct bcm_op *)data;
const struct can_frame *rxframe = (struct can_frame *)skb->data;
- int i;
+ unsigned int i;
/* disable timeout */
hrtimer_cancel(&op->timer);
@@ -822,14 +830,15 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
{
struct bcm_sock *bo = bcm_sk(sk);
struct bcm_op *op;
- int i, err;
+ unsigned int i;
+ int err;
/* we need a real device to send frames */
if (!ifindex)
return -ENODEV;
- /* we need at least one can_frame */
- if (msg_head->nframes < 1)
+ /* check nframes boundaries - we need at least one can_frame */
+ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
return -EINVAL;
/* check the given can_id */
@@ -993,6 +1002,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
msg_head->nframes = 0;
}
+ /* the first element contains the mux-mask => MAX_NFRAMES + 1 */
+ if (msg_head->nframes > MAX_NFRAMES + 1)
+ return -EINVAL;
+
if ((msg_head->flags & RX_RTR_FRAME) &&
((msg_head->nframes != 1) ||
(!(msg_head->can_id & CAN_RTR_FLAG))))
diff --git a/net/can/raw.c b/net/can/raw.c
index a10e3338f08..e88f610fdb7 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -90,23 +90,39 @@ struct raw_sock {
can_err_mask_t err_mask;
};
+/*
+ * Return pointer to store the extra msg flags for raw_recvmsg().
+ * We use the space of one unsigned int beyond the 'struct sockaddr_can'
+ * in skb->cb.
+ */
+static inline unsigned int *raw_flags(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) <= (sizeof(struct sockaddr_can) +
+ sizeof(unsigned int)));
+
+ /* return pointer after struct sockaddr_can */
+ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
static inline struct raw_sock *raw_sk(const struct sock *sk)
{
return (struct raw_sock *)sk;
}
-static void raw_rcv(struct sk_buff *skb, void *data)
+static void raw_rcv(struct sk_buff *oskb, void *data)
{
struct sock *sk = (struct sock *)data;
struct raw_sock *ro = raw_sk(sk);
struct sockaddr_can *addr;
+ struct sk_buff *skb;
+ unsigned int *pflags;
/* check the received tx sock reference */
- if (!ro->recv_own_msgs && skb->sk == sk)
+ if (!ro->recv_own_msgs && oskb->sk == sk)
return;
/* clone the given skb to be able to enqueue it into the rcv queue */
- skb = skb_clone(skb, GFP_ATOMIC);
+ skb = skb_clone(oskb, GFP_ATOMIC);
if (!skb)
return;
@@ -123,6 +139,14 @@ static void raw_rcv(struct sk_buff *skb, void *data)
addr->can_family = AF_CAN;
addr->can_ifindex = skb->dev->ifindex;
+ /* add CAN specific message flags for raw_recvmsg() */
+ pflags = raw_flags(skb);
+ *pflags = 0;
+ if (oskb->sk)
+ *pflags |= MSG_DONTROUTE;
+ if (oskb->sk == sk)
+ *pflags |= MSG_CONFIRM;
+
if (sock_queue_rcv_skb(sk, skb) < 0)
kfree_skb(skb);
}
@@ -647,12 +671,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
if (err < 0)
goto free_skb;
- err = sock_tx_timestamp(msg, sk, skb_tx(skb));
+ err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
if (err < 0)
goto free_skb;
/* to be able to check the received tx sock reference in raw_rcv() */
- skb_tx(skb)->prevent_sk_orphan = 1;
+ skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF;
skb->dev = dev;
skb->sk = sk;
@@ -707,6 +731,9 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
}
+ /* assign the flags that have been recorded in raw_rcv() */
+ msg->msg_flags |= *(raw_flags(skb));
+
skb_free_datagram(sk, skb);
return size;
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 00000000000..ad424049b0c
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
+config CEPH_LIB
+ tristate "Ceph core library (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+ Choose Y or M here to include cephlib, which provides the
+ common functionality to both the Ceph filesystem and
+ to the rados block device (rbd).
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+ bool "Include file:line in ceph debug output"
+ depends on CEPH_LIB
+ default n
+ help
+ If you say Y here, debug output will include a filename and
+ line to aid debugging. This increases kernel size and slows
+ execution slightly when debug call sites are enabled (e.g.,
+ via CONFIG_DYNAMIC_DEBUG).
+
+ If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 00000000000..aab1cabb803
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+ mon_client.o \
+ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ debugfs.o \
+ auth.o auth_none.o \
+ crypto.o armor.o \
+ auth_x.o \
+ ceph_fs.o ceph_strings.o ceph_hash.o \
+ pagevec.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
+
+modules_install:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
+
+clean:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 00000000000..eb2a666b0be
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,103 @@
+
+#include <linux/errno.h>
+
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+/*
+ * base64 encode/decode.
+ */
+
+static const char *pem_key =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+ return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+ if (c >= 'A' && c <= 'Z')
+ return c - 'A';
+ if (c >= 'a' && c <= 'z')
+ return c - 'a' + 26;
+ if (c >= '0' && c <= '9')
+ return c - '0' + 52;
+ if (c == '+')
+ return 62;
+ if (c == '/')
+ return 63;
+ if (c == '=')
+ return 0; /* just non-negative, please */
+ return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+ int line = 0;
+
+ while (src < end) {
+ unsigned char a, b, c;
+
+ a = *src++;
+ *dst++ = encode_bits(a >> 2);
+ if (src < end) {
+ b = *src++;
+ *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+ if (src < end) {
+ c = *src++;
+ *dst++ = encode_bits(((b & 15) << 2) |
+ (c >> 6));
+ *dst++ = encode_bits(c & 63);
+ } else {
+ *dst++ = encode_bits((b & 15) << 2);
+ *dst++ = '=';
+ }
+ } else {
+ *dst++ = encode_bits(((a & 3) << 4));
+ *dst++ = '=';
+ *dst++ = '=';
+ }
+ olen += 4;
+ line += 4;
+ if (line == 64) {
+ line = 0;
+ *(dst++) = '\n';
+ olen++;
+ }
+ }
+ return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+
+ while (src < end) {
+ int a, b, c, d;
+
+ if (src < end && src[0] == '\n')
+ src++;
+ if (src + 4 > end)
+ return -EINVAL;
+ a = decode_bits(src[0]);
+ b = decode_bits(src[1]);
+ c = decode_bits(src[2]);
+ d = decode_bits(src[3]);
+ if (a < 0 || b < 0 || c < 0 || d < 0)
+ return -EINVAL;
+
+ *dst++ = (a << 2) | (b >> 4);
+ if (src[2] == '=')
+ return olen + 1;
+ *dst++ = ((b & 15) << 4) | (c >> 2);
+ if (src[3] == '=')
+ return olen + 2;
+ *dst++ = ((c & 3) << 6) | d;
+ olen += 3;
+ src += 4;
+ }
+ return olen;
+}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 00000000000..549c1f43e1d
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include "auth_none.h"
+#include "auth_x.h"
+
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+ CEPH_AUTH_NONE,
+ CEPH_AUTH_CEPHX
+};
+
+static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+ switch (protocol) {
+ case CEPH_AUTH_NONE:
+ return ceph_auth_none_init(ac);
+ case CEPH_AUTH_CEPHX:
+ return ceph_x_init(ac);
+ default:
+ return -ENOENT;
+ }
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+ struct ceph_auth_client *ac;
+ int ret;
+
+ dout("auth_init name '%s' secret '%s'\n", name, secret);
+
+ ret = -ENOMEM;
+ ac = kzalloc(sizeof(*ac), GFP_NOFS);
+ if (!ac)
+ goto out;
+
+ ac->negotiating = true;
+ if (name)
+ ac->name = name;
+ else
+ ac->name = CEPH_AUTH_NAME_DEFAULT;
+ dout("auth_init name %s secret %s\n", ac->name, secret);
+ ac->secret = secret;
+ return ac;
+
+out:
+ return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+ dout("auth_destroy %p\n", ac);
+ if (ac->ops)
+ ac->ops->destroy(ac);
+ kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+ dout("auth_reset %p\n", ac);
+ if (ac->ops && !ac->negotiating)
+ ac->ops->reset(ac);
+ ac->negotiating = true;
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+ int len = strlen(name);
+
+ if (*p + 2*sizeof(u32) + len > end)
+ return -ERANGE;
+ ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_32(p, len);
+ ceph_encode_copy(p, name, len);
+ return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor. Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+ struct ceph_mon_request_header *monhdr = buf;
+ void *p = monhdr + 1, *end = buf + len, *lenp;
+ int i, num;
+ int ret;
+
+ dout("auth_build_hello\n");
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, 0); /* no protocol, yet */
+
+ lenp = p;
+ p += sizeof(u32);
+
+ ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+ ceph_encode_8(&p, 1);
+ num = ARRAY_SIZE(supported_protocols);
+ ceph_encode_32(&p, num);
+ ceph_decode_need(&p, end, num * sizeof(u32), bad);
+ for (i = 0; i < num; i++)
+ ceph_encode_32(&p, supported_protocols[i]);
+
+ ret = ceph_entity_name_encode(ac->name, &p, end);
+ if (ret < 0)
+ return ret;
+ ceph_decode_need(&p, end, sizeof(u64), bad);
+ ceph_encode_64(&p, ac->global_id);
+
+ ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+ return p - buf;
+
+bad:
+ return -ERANGE;
+}
+
+static int ceph_build_auth_request(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len)
+{
+ struct ceph_mon_request_header *monhdr = msg_buf;
+ void *p = monhdr + 1;
+ void *end = msg_buf + msg_len;
+ int ret;
+
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, ac->protocol);
+
+ ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+ if (ret < 0) {
+ pr_err("error %d building auth method %s request\n", ret,
+ ac->ops->name);
+ return ret;
+ }
+ dout(" built request %d bytes\n", ret);
+ ceph_encode_32(&p, ret);
+ return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len)
+{
+ void *p = buf;
+ void *end = buf + len;
+ int protocol;
+ s32 result;
+ u64 global_id;
+ void *payload, *payload_end;
+ int payload_len;
+ char *result_msg;
+ int result_msg_len;
+ int ret = -EINVAL;
+
+ dout("handle_auth_reply %p %p\n", p, end);
+ ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+ protocol = ceph_decode_32(&p);
+ result = ceph_decode_32(&p);
+ global_id = ceph_decode_64(&p);
+ payload_len = ceph_decode_32(&p);
+ payload = p;
+ p += payload_len;
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ result_msg_len = ceph_decode_32(&p);
+ result_msg = p;
+ p += result_msg_len;
+ if (p != end)
+ goto bad;
+
+ dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+ result_msg, global_id, payload_len);
+
+ payload_end = payload + payload_len;
+
+ if (global_id && ac->global_id != global_id) {
+ dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+ ac->global_id = global_id;
+ }
+
+ if (ac->negotiating) {
+ /* server does not support our protocols? */
+ if (!protocol && result < 0) {
+ ret = result;
+ goto out;
+ }
+ /* set up (new) protocol handler? */
+ if (ac->protocol && ac->protocol != protocol) {
+ ac->ops->destroy(ac);
+ ac->protocol = 0;
+ ac->ops = NULL;
+ }
+ if (ac->protocol != protocol) {
+ ret = ceph_auth_init_protocol(ac, protocol);
+ if (ret) {
+ pr_err("error %d on auth protocol %d init\n",
+ ret, protocol);
+ goto out;
+ }
+ }
+
+ ac->negotiating = false;
+ }
+
+ ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+ if (ret == -EAGAIN) {
+ return ceph_build_auth_request(ac, reply_buf, reply_len);
+ } else if (ret) {
+ pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+ return ret;
+ }
+ return 0;
+
+bad:
+ pr_err("failed to decode auth msg\n");
+out:
+ return ret;
+}
+
+int ceph_build_auth(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len)
+{
+ if (!ac->protocol)
+ return ceph_auth_build_hello(ac, msg_buf, msg_len);
+ BUG_ON(!ac->ops);
+ if (ac->ops->should_authenticate(ac))
+ return ceph_build_auth_request(ac, msg_buf, msg_len);
+ return 0;
+}
+
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+ if (!ac->ops)
+ return 0;
+ return ac->ops->is_authenticated(ac);
+}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 00000000000..214c2bb43d6
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "auth_none.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return !xi->starting;
+}
+
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return xi->starting;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = false;
+ return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id. we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len)
+{
+ struct ceph_auth_none_info *ai = ac->private;
+ struct ceph_none_authorizer *au = &ai->au;
+ void *p, *end;
+ int ret;
+
+ if (!ai->built_authorizer) {
+ p = au->buf;
+ end = p + sizeof(au->buf);
+ ceph_encode_8(&p, 1);
+ ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+ if (ret < 0)
+ goto bad;
+ ceph_decode_need(&p, end, sizeof(u64), bad2);
+ ceph_encode_64(&p, ac->global_id);
+ au->buf_len = p - (void *)au->buf;
+ ai->built_authorizer = true;
+ dout("built authorizer len %d\n", au->buf_len);
+ }
+
+ *a = (struct ceph_authorizer *)au;
+ *buf = au->buf;
+ *len = au->buf_len;
+ *reply_buf = au->reply_buf;
+ *reply_len = sizeof(au->reply_buf);
+ return 0;
+
+bad2:
+ ret = -ERANGE;
+bad:
+ return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ /* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+ .name = "none",
+ .reset = reset,
+ .destroy = destroy,
+ .is_authenticated = is_authenticated,
+ .should_authenticate = should_authenticate,
+ .handle_reply = handle_reply,
+ .create_authorizer = ceph_auth_none_create_authorizer,
+ .destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi;
+
+ dout("ceph_auth_none_init %p\n", ac);
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ return -ENOMEM;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+
+ ac->protocol = CEPH_AUTH_NONE;
+ ac->private = xi;
+ ac->ops = &ceph_auth_none_ops;
+ return 0;
+}
+
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 00000000000..ed7d088b1bc
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+ char buf[128];
+ int buf_len;
+ char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+ bool starting;
+ bool built_authorizer;
+ struct ceph_none_authorizer au; /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 00000000000..7fd5dfcf6e1
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,688 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+
+#define TEMP_TICKET_BUF_LEN 256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+ ac->want_keys, need, xi->have_keys);
+ return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+ ac->want_keys, need, xi->have_keys);
+ return need != 0;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+ return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+ sizeof(u32);
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+ void *ibuf, int ilen, void *obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head = {
+ .struct_v = 1,
+ .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+ };
+ size_t len = olen - sizeof(u32);
+ int ret;
+
+ ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+ &head, sizeof(head), ibuf, ilen);
+ if (ret)
+ return ret;
+ ceph_encode_32(&obuf, len);
+ return len + sizeof(u32);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+ void **p, void *end, void *obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head;
+ size_t head_len = sizeof(head);
+ int len, ret;
+
+ len = ceph_decode_32(p);
+ if (*p + len > end)
+ return -EINVAL;
+
+ dout("ceph_x_decrypt len %d\n", len);
+ ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+ *p, len);
+ if (ret)
+ return ret;
+ if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+ return -EPERM;
+ *p += len;
+ return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+static struct ceph_x_ticket_handler *
+get_ticket_handler(struct ceph_auth_client *ac, int service)
+{
+ struct ceph_x_ticket_handler *th;
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+ while (*p) {
+ parent = *p;
+ th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+ if (service < th->service)
+ p = &(*p)->rb_left;
+ else if (service > th->service)
+ p = &(*p)->rb_right;
+ else
+ return th;
+ }
+
+ /* add it */
+ th = kzalloc(sizeof(*th), GFP_NOFS);
+ if (!th)
+ return ERR_PTR(-ENOMEM);
+ th->service = service;
+ rb_link_node(&th->node, parent, p);
+ rb_insert_color(&th->node, &xi->ticket_handlers);
+ return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("remove_ticket_handler %p %d\n", th, th->service);
+ rb_erase(&th->node, &xi->ticket_handlers);
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ kfree(th);
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+ struct ceph_crypto_key *secret,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int num;
+ void *p = buf;
+ int ret;
+ char *dbuf;
+ char *ticket_buf;
+ u8 reply_struct_v;
+
+ dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+ if (!dbuf)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+ if (!ticket_buf)
+ goto out_dbuf;
+
+ ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+ reply_struct_v = ceph_decode_8(&p);
+ if (reply_struct_v != 1)
+ goto bad;
+ num = ceph_decode_32(&p);
+ dout("%d tickets\n", num);
+ while (num--) {
+ int type;
+ u8 tkt_struct_v, blob_struct_v;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int dlen;
+ char is_enc;
+ struct timespec validity;
+ struct ceph_crypto_key old_key;
+ void *tp, *tpend;
+ struct ceph_timespec new_validity;
+ struct ceph_crypto_key new_session_key;
+ struct ceph_buffer *new_ticket_blob;
+ unsigned long new_expires, new_renew_after;
+ u64 new_secret_id;
+
+ ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+ type = ceph_decode_32(&p);
+ dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+ tkt_struct_v = ceph_decode_8(&p);
+ if (tkt_struct_v != 1)
+ goto bad;
+
+ th = get_ticket_handler(ac, type);
+ if (IS_ERR(th)) {
+ ret = PTR_ERR(th);
+ goto out;
+ }
+
+ /* blob for me */
+ dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+ TEMP_TICKET_BUF_LEN);
+ if (dlen <= 0) {
+ ret = dlen;
+ goto out;
+ }
+ dout(" decrypted %d bytes\n", dlen);
+ dend = dbuf + dlen;
+ dp = dbuf;
+
+ tkt_struct_v = ceph_decode_8(&dp);
+ if (tkt_struct_v != 1)
+ goto bad;
+
+ memcpy(&old_key, &th->session_key, sizeof(old_key));
+ ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+ if (ret)
+ goto out;
+
+ ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+ ceph_decode_timespec(&validity, &new_validity);
+ new_expires = get_seconds() + validity.tv_sec;
+ new_renew_after = new_expires - (validity.tv_sec / 4);
+ dout(" expires=%lu renew_after=%lu\n", new_expires,
+ new_renew_after);
+
+ /* ticket blob for service */
+ ceph_decode_8_safe(&p, end, is_enc, bad);
+ tp = ticket_buf;
+ if (is_enc) {
+ /* encrypted */
+ dout(" encrypted ticket\n");
+ dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+ TEMP_TICKET_BUF_LEN);
+ if (dlen < 0) {
+ ret = dlen;
+ goto out;
+ }
+ dlen = ceph_decode_32(&tp);
+ } else {
+ /* unencrypted */
+ ceph_decode_32_safe(&p, end, dlen, bad);
+ ceph_decode_need(&p, end, dlen, bad);
+ ceph_decode_copy(&p, ticket_buf, dlen);
+ }
+ tpend = tp + dlen;
+ dout(" ticket blob is %d bytes\n", dlen);
+ ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+ blob_struct_v = ceph_decode_8(&tp);
+ new_secret_id = ceph_decode_64(&tp);
+ ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
+ if (ret)
+ goto out;
+
+ /* all is well, update our ticket */
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ th->session_key = new_session_key;
+ th->ticket_blob = new_ticket_blob;
+ th->validity = new_validity;
+ th->secret_id = new_secret_id;
+ th->expires = new_expires;
+ th->renew_after = new_renew_after;
+ dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+ type, ceph_entity_type_name(type), th->secret_id,
+ (int)th->ticket_blob->vec.iov_len);
+ xi->have_keys |= th->service;
+ }
+
+ ret = 0;
+out:
+ kfree(ticket_buf);
+out_dbuf:
+ kfree(dbuf);
+ return ret;
+
+bad:
+ ret = -EINVAL;
+ goto out;
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th,
+ struct ceph_x_authorizer *au)
+{
+ int maxlen;
+ struct ceph_x_authorize_a *msg_a;
+ struct ceph_x_authorize_b msg_b;
+ void *p, *end;
+ int ret;
+ int ticket_blob_len =
+ (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+ dout("build_authorizer for %s %p\n",
+ ceph_entity_type_name(th->service), au);
+
+ maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+ ceph_x_encrypt_buflen(ticket_blob_len);
+ dout(" need len %d\n", maxlen);
+ if (au->buf && au->buf->alloc_len < maxlen) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+ if (!au->buf) {
+ au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+ if (!au->buf)
+ return -ENOMEM;
+ }
+ au->service = th->service;
+
+ msg_a = au->buf->vec.iov_base;
+ msg_a->struct_v = 1;
+ msg_a->global_id = cpu_to_le64(ac->global_id);
+ msg_a->service_id = cpu_to_le32(th->service);
+ msg_a->ticket_blob.struct_v = 1;
+ msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+ msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+ if (ticket_blob_len) {
+ memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+ th->ticket_blob->vec.iov_len);
+ }
+ dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+ le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+ p = msg_a + 1;
+ p += ticket_blob_len;
+ end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+ get_random_bytes(&au->nonce, sizeof(au->nonce));
+ msg_b.struct_v = 1;
+ msg_b.nonce = cpu_to_le64(au->nonce);
+ ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+ p, end - p);
+ if (ret < 0)
+ goto out_buf;
+ p += ret;
+ au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+ dout(" built authorizer nonce %llx len %d\n", au->nonce,
+ (int)au->buf->vec.iov_len);
+ BUG_ON(au->buf->vec.iov_len > maxlen);
+ return 0;
+
+out_buf:
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ return ret;
+}
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+ void **p, void *end)
+{
+ ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, th->secret_id);
+ if (th->ticket_blob) {
+ const char *buf = th->ticket_blob->vec.iov_base;
+ u32 len = th->ticket_blob->vec.iov_len;
+
+ ceph_encode_32_safe(p, end, len, bad);
+ ceph_encode_copy_safe(p, end, buf, len, bad);
+ } else {
+ ceph_encode_32_safe(p, end, 0, bad);
+ }
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+ int want = ac->want_keys;
+ struct ceph_x_info *xi = ac->private;
+ int service;
+
+ *pneed = ac->want_keys & ~(xi->have_keys);
+
+ for (service = 1; service <= want; service <<= 1) {
+ struct ceph_x_ticket_handler *th;
+
+ if (!(ac->want_keys & service))
+ continue;
+
+ if (*pneed & service)
+ continue;
+
+ th = get_ticket_handler(ac, service);
+
+ if (IS_ERR(th)) {
+ *pneed |= service;
+ continue;
+ }
+
+ if (get_seconds() >= th->renew_after)
+ *pneed |= service;
+ if (get_seconds() >= th->expires)
+ xi->have_keys &= ~service;
+ }
+}
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+ struct ceph_x_request_header *head = buf;
+ int ret;
+ struct ceph_x_ticket_handler *th =
+ get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ ceph_x_validate_tickets(ac, &need);
+
+ dout("build_request want %x have %x need %x\n",
+ ac->want_keys, xi->have_keys, need);
+
+ if (need & CEPH_ENTITY_TYPE_AUTH) {
+ struct ceph_x_authenticate *auth = (void *)(head + 1);
+ void *p = auth + 1;
+ struct ceph_x_challenge_blob tmp;
+ char tmp_enc[40];
+ u64 *u;
+
+ if (p > end)
+ return -ERANGE;
+
+ dout(" get_auth_session_key\n");
+ head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+ /* encrypt and hash */
+ get_random_bytes(&auth->client_challenge, sizeof(u64));
+ tmp.client_challenge = auth->client_challenge;
+ tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+ ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+ tmp_enc, sizeof(tmp_enc));
+ if (ret < 0)
+ return ret;
+
+ auth->struct_v = 1;
+ auth->key = 0;
+ for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+ auth->key ^= *(__le64 *)u;
+ dout(" server_challenge %llx client_challenge %llx key %llx\n",
+ xi->server_challenge, le64_to_cpu(auth->client_challenge),
+ le64_to_cpu(auth->key));
+
+ /* now encode the old ticket if exists */
+ ret = ceph_x_encode_ticket(th, &p, end);
+ if (ret < 0)
+ return ret;
+
+ return p - buf;
+ }
+
+ if (need) {
+ void *p = head + 1;
+ struct ceph_x_service_ticket_request *req;
+
+ if (p > end)
+ return -ERANGE;
+ head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+ ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+ if (ret)
+ return ret;
+ ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len);
+
+ req = p;
+ req->keys = cpu_to_le32(need);
+ p += sizeof(*req);
+ return p - buf;
+ }
+
+ return 0;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_reply_header *head = buf;
+ struct ceph_x_ticket_handler *th;
+ int len = end - buf;
+ int op;
+ int ret;
+
+ if (result)
+ return result; /* XXX hmm? */
+
+ if (xi->starting) {
+ /* it's a hello */
+ struct ceph_x_server_challenge *sc = buf;
+
+ if (len != sizeof(*sc))
+ return -EINVAL;
+ xi->server_challenge = le64_to_cpu(sc->server_challenge);
+ dout("handle_reply got server challenge %llx\n",
+ xi->server_challenge);
+ xi->starting = false;
+ xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+ return -EAGAIN;
+ }
+
+ op = le16_to_cpu(head->op);
+ result = le32_to_cpu(head->result);
+ dout("handle_reply op %d result %d\n", op, result);
+ switch (op) {
+ case CEPHX_GET_AUTH_SESSION_KEY:
+ /* verify auth key */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+ buf + sizeof(*head), end);
+ break;
+
+ case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ buf + sizeof(*head), end);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (ret)
+ return ret;
+ if (ac->want_keys == xi->have_keys)
+ return 0;
+ return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+ int ret;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = kzalloc(sizeof(*au), GFP_NOFS);
+ if (!au)
+ return -ENOMEM;
+
+ ret = ceph_x_build_authorizer(ac, th, au);
+ if (ret) {
+ kfree(au);
+ return ret;
+ }
+
+ *a = (struct ceph_authorizer *)au;
+ *buf = au->buf->vec.iov_base;
+ *len = au->buf->vec.iov_len;
+ *reply_buf = au->reply_buf;
+ *reply_len = sizeof(au->reply_buf);
+ return 0;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+ struct ceph_x_ticket_handler *th;
+ int ret = 0;
+ struct ceph_x_authorize_reply reply;
+ void *p = au->reply_buf;
+ void *end = p + sizeof(au->reply_buf);
+
+ th = get_ticket_handler(ac, au->service);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+ ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+ if (ret < 0)
+ return ret;
+ if (ret != sizeof(reply))
+ return -EPERM;
+
+ if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+ ret = -EPERM;
+ else
+ ret = 0;
+ dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+ au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+ return ret;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+
+ ceph_buffer_put(au->buf);
+ kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("reset\n");
+ xi->starting = true;
+ xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *p;
+
+ dout("ceph_x_destroy %p\n", ac);
+ ceph_crypto_key_destroy(&xi->secret);
+
+ while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+ struct ceph_x_ticket_handler *th =
+ rb_entry(p, struct ceph_x_ticket_handler, node);
+ remove_ticket_handler(ac, th);
+ }
+
+ if (xi->auth_authorizer.buf)
+ ceph_buffer_put(xi->auth_authorizer.buf);
+
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type)
+{
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (!IS_ERR(th))
+ remove_ticket_handler(ac, th);
+}
+
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+ .name = "x",
+ .is_authenticated = ceph_x_is_authenticated,
+ .should_authenticate = ceph_x_should_authenticate,
+ .build_request = ceph_x_build_request,
+ .handle_reply = ceph_x_handle_reply,
+ .create_authorizer = ceph_x_create_authorizer,
+ .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+ .destroy_authorizer = ceph_x_destroy_authorizer,
+ .invalidate_authorizer = ceph_x_invalidate_authorizer,
+ .reset = ceph_x_reset,
+ .destroy = ceph_x_destroy,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi;
+ int ret;
+
+ dout("ceph_x_init %p\n", ac);
+ ret = -ENOMEM;
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ goto out;
+
+ ret = -EINVAL;
+ if (!ac->secret) {
+ pr_err("no secret set (for auth_x protocol)\n");
+ goto out_nomem;
+ }
+
+ ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+ if (ret)
+ goto out_nomem;
+
+ xi->starting = true;
+ xi->ticket_handlers = RB_ROOT;
+
+ ac->protocol = CEPH_AUTH_CEPHX;
+ ac->private = xi;
+ ac->ops = &ceph_x_ops;
+ return 0;
+
+out_nomem:
+ kfree(xi);
+out:
+ return ret;
+}
+
+
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 00000000000..e02da7a5c5a
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+ struct rb_node node;
+ unsigned service;
+
+ struct ceph_crypto_key session_key;
+ struct ceph_timespec validity;
+
+ u64 secret_id;
+ struct ceph_buffer *ticket_blob;
+
+ unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+ struct ceph_buffer *buf;
+ unsigned service;
+ u64 nonce;
+ char reply_buf[128]; /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+ struct ceph_crypto_key secret;
+
+ bool starting;
+ u64 server_challenge;
+
+ unsigned have_keys;
+ struct rb_root ticket_handlers;
+
+ struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 00000000000..671d30576c4
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY 0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+ __u8 struct_v;
+ __le64 secret_id;
+ __le32 blob_len;
+ char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+ __le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+ __le16 op;
+ __le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+ __u8 struct_v;
+ __le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+ __u8 struct_v;
+ __le64 client_challenge;
+ __le64 key;
+ /* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+ __u8 struct_v;
+ __le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+ __le64 server_challenge;
+ __le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ * a - service id, ticket blob
+ * b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+ __u8 struct_v;
+ __le64 global_id;
+ __le32 service_id;
+ struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+ __u8 struct_v;
+ __le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+ __u8 struct_v;
+ __le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encyption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+ __u8 struct_v;
+ __le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 00000000000..53d8abfa25d
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+ struct ceph_buffer *b;
+
+ b = kmalloc(sizeof(*b), gfp);
+ if (!b)
+ return NULL;
+
+ b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+ if (b->vec.iov_base) {
+ b->is_vmalloc = false;
+ } else {
+ b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+ if (!b->vec.iov_base) {
+ kfree(b);
+ return NULL;
+ }
+ b->is_vmalloc = true;
+ }
+
+ kref_init(&b->kref);
+ b->alloc_len = len;
+ b->vec.iov_len = len;
+ dout("buffer_new %p\n", b);
+ return b;
+}
+EXPORT_SYMBOL(ceph_buffer_new);
+
+void ceph_buffer_release(struct kref *kref)
+{
+ struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+ dout("buffer_release %p\n", b);
+ if (b->vec.iov_base) {
+ if (b->is_vmalloc)
+ vfree(b->vec.iov_base);
+ else
+ kfree(b->vec.iov_base);
+ }
+ kfree(b);
+}
+EXPORT_SYMBOL(ceph_buffer_release);
+
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+ size_t len;
+
+ ceph_decode_need(p, end, sizeof(u32), bad);
+ len = ceph_decode_32(p);
+ dout("decode_buffer len %d\n", (int)len);
+ ceph_decode_need(p, end, len, bad);
+ *b = ceph_buffer_new(len, GFP_NOFS);
+ if (!*b)
+ return -ENOMEM;
+ ceph_decode_copy(p, (*b)->vec.iov_base, len);
+ return 0;
+bad:
+ return -EINVAL;
+}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 00000000000..f3e4a13fea0
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+ const char *e = s + len;
+
+ while (e != s && *(e-1) != '/')
+ e--;
+ return e;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+const char *ceph_msg_type_name(int type)
+{
+ switch (type) {
+ case CEPH_MSG_SHUTDOWN: return "shutdown";
+ case CEPH_MSG_PING: return "ping";
+ case CEPH_MSG_AUTH: return "auth";
+ case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+ case CEPH_MSG_MON_MAP: return "mon_map";
+ case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+ case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+ case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+ case CEPH_MSG_STATFS: return "statfs";
+ case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+ case CEPH_MSG_MDS_MAP: return "mds_map";
+ case CEPH_MSG_CLIENT_SESSION: return "client_session";
+ case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+ case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+ case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+ case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+ case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+ case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+ case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+ case CEPH_MSG_OSD_MAP: return "osd_map";
+ case CEPH_MSG_OSD_OP: return "osd_op";
+ case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+ default: return "unknown";
+ }
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+ if (client->have_fsid) {
+ if (ceph_fsid_compare(&client->fsid, fsid)) {
+ pr_err("bad fsid, had %pU got %pU",
+ &client->fsid, fsid);
+ return -1;
+ }
+ } else {
+ pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+ memcpy(&client->fsid, fsid, sizeof(*fsid));
+ ceph_debugfs_client_init(client);
+ client->have_fsid = true;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+ if (!s1 && !s2)
+ return 0;
+ if (s1 && !s2)
+ return -1;
+ if (!s1 && s2)
+ return 1;
+ return strcmp(s1, s2);
+}
+
+int ceph_compare_options(struct ceph_options *new_opt,
+ struct ceph_client *client)
+{
+ struct ceph_options *opt1 = new_opt;
+ struct ceph_options *opt2 = client->options;
+ int ofs = offsetof(struct ceph_options, mon_addr);
+ int i;
+ int ret;
+
+ ret = memcmp(opt1, opt2, ofs);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(opt1->name, opt2->name);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(opt1->secret, opt2->secret);
+ if (ret)
+ return ret;
+
+ /* any matching mon ip implies a match */
+ for (i = 0; i < opt1->num_mon; i++) {
+ if (ceph_monmap_contains(client->monc.monmap,
+ &opt1->mon_addr[i]))
+ return 0;
+ }
+ return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+ int i = 0;
+ char tmp[3];
+ int err = -EINVAL;
+ int d;
+
+ dout("parse_fsid '%s'\n", str);
+ tmp[2] = 0;
+ while (*str && i < 16) {
+ if (ispunct(*str)) {
+ str++;
+ continue;
+ }
+ if (!isxdigit(str[0]) || !isxdigit(str[1]))
+ break;
+ tmp[0] = str[0];
+ tmp[1] = str[1];
+ if (sscanf(tmp, "%x", &d) < 1)
+ break;
+ fsid->fsid[i] = d & 0xff;
+ i++;
+ str += 2;
+ }
+
+ if (i == 16)
+ err = 0;
+ dout("parse_fsid ret %d got fsid %pU", err, fsid);
+ return err;
+}
+
+/*
+ * ceph options
+ */
+enum {
+ Opt_osdtimeout,
+ Opt_osdkeepalivetimeout,
+ Opt_mount_timeout,
+ Opt_osd_idle_ttl,
+ Opt_last_int,
+ /* int args above */
+ Opt_fsid,
+ Opt_name,
+ Opt_secret,
+ Opt_ip,
+ Opt_last_string,
+ /* string args above */
+ Opt_noshare,
+ Opt_nocrc,
+};
+
+static match_table_t opt_tokens = {
+ {Opt_osdtimeout, "osdtimeout=%d"},
+ {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+ {Opt_mount_timeout, "mount_timeout=%d"},
+ {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+ /* int args above */
+ {Opt_fsid, "fsid=%s"},
+ {Opt_name, "name=%s"},
+ {Opt_secret, "secret=%s"},
+ {Opt_ip, "ip=%s"},
+ /* string args above */
+ {Opt_noshare, "noshare"},
+ {Opt_nocrc, "nocrc"},
+ {-1, NULL}
+};
+
+void ceph_destroy_options(struct ceph_options *opt)
+{
+ dout("destroy_options %p\n", opt);
+ kfree(opt->name);
+ kfree(opt->secret);
+ kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+int ceph_parse_options(struct ceph_options **popt, char *options,
+ const char *dev_name, const char *dev_name_end,
+ int (*parse_extra_token)(char *c, void *private),
+ void *private)
+{
+ struct ceph_options *opt;
+ const char *c;
+ int err = -ENOMEM;
+ substring_t argstr[MAX_OPT_ARGS];
+
+ opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+ if (!opt)
+ return err;
+ opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+ GFP_KERNEL);
+ if (!opt->mon_addr)
+ goto out;
+
+ dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+ dev_name);
+
+ /* start with defaults */
+ opt->flags = CEPH_OPT_DEFAULT;
+ opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+ opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
+
+ /* get mon ip(s) */
+ /* ip1[:port1][,ip2[:port2]...] */
+ err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+ CEPH_MAX_MON, &opt->num_mon);
+ if (err < 0)
+ goto out;
+
+ /* parse mount options */
+ while ((c = strsep(&options, ",")) != NULL) {
+ int token, intval, ret;
+ if (!*c)
+ continue;
+ err = -EINVAL;
+ token = match_token((char *)c, opt_tokens, argstr);
+ if (token < 0 && parse_extra_token) {
+ /* extra? */
+ err = parse_extra_token((char *)c, private);
+ if (err < 0) {
+ pr_err("bad option at '%s'\n", c);
+ goto out;
+ }
+ continue;
+ }
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ continue;
+ }
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
+ }
+ switch (token) {
+ case Opt_ip:
+ err = ceph_parse_ips(argstr[0].from,
+ argstr[0].to,
+ &opt->my_addr,
+ 1, NULL);
+ if (err < 0)
+ goto out;
+ opt->flags |= CEPH_OPT_MYIP;
+ break;
+
+ case Opt_fsid:
+ err = parse_fsid(argstr[0].from, &opt->fsid);
+ if (err == 0)
+ opt->flags |= CEPH_OPT_FSID;
+ break;
+ case Opt_name:
+ opt->name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_secret:
+ opt->secret = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+
+ /* misc */
+ case Opt_osdtimeout:
+ opt->osd_timeout = intval;
+ break;
+ case Opt_osdkeepalivetimeout:
+ opt->osd_keepalive_timeout = intval;
+ break;
+ case Opt_osd_idle_ttl:
+ opt->osd_idle_ttl = intval;
+ break;
+ case Opt_mount_timeout:
+ opt->mount_timeout = intval;
+ break;
+
+ case Opt_noshare:
+ opt->flags |= CEPH_OPT_NOSHARE;
+ break;
+
+ case Opt_nocrc:
+ opt->flags |= CEPH_OPT_NOCRC;
+ break;
+
+ default:
+ BUG_ON(token);
+ }
+ }
+
+ /* success */
+ *popt = opt;
+ return 0;
+
+out:
+ ceph_destroy_options(opt);
+ return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+u64 ceph_client_id(struct ceph_client *client)
+{
+ return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+ struct ceph_client *client;
+ int err = -ENOMEM;
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (client == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ client->private = private;
+ client->options = opt;
+
+ mutex_init(&client->mount_mutex);
+ init_waitqueue_head(&client->auth_wq);
+ client->auth_err = 0;
+
+ client->extra_mon_dispatch = NULL;
+ client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+ client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+
+ client->msgr = NULL;
+
+ /* subsystems */
+ err = ceph_monc_init(&client->monc, client);
+ if (err < 0)
+ goto fail;
+ err = ceph_osdc_init(&client->osdc, client);
+ if (err < 0)
+ goto fail_monc;
+
+ return client;
+
+fail_monc:
+ ceph_monc_stop(&client->monc);
+fail:
+ kfree(client);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+void ceph_destroy_client(struct ceph_client *client)
+{
+ dout("destroy_client %p\n", client);
+
+ /* unmount */
+ ceph_osdc_stop(&client->osdc);
+
+ /*
+ * make sure mds and osd connections close out before destroying
+ * the auth module, which is needed to free those connections'
+ * ceph_authorizers.
+ */
+ ceph_msgr_flush();
+
+ ceph_monc_stop(&client->monc);
+
+ ceph_debugfs_client_cleanup(client);
+
+ if (client->msgr)
+ ceph_messenger_destroy(client->msgr);
+
+ ceph_destroy_options(client->options);
+
+ kfree(client);
+ dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+ return client->monc.monmap && client->monc.monmap->epoch &&
+ client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+ struct ceph_entity_addr *myaddr = NULL;
+ int err;
+ unsigned long timeout = client->options->mount_timeout * HZ;
+
+ /* initialize the messenger */
+ if (client->msgr == NULL) {
+ if (ceph_test_opt(client, MYIP))
+ myaddr = &client->options->my_addr;
+ client->msgr = ceph_messenger_create(myaddr,
+ client->supported_features,
+ client->required_features);
+ if (IS_ERR(client->msgr)) {
+ client->msgr = NULL;
+ return PTR_ERR(client->msgr);
+ }
+ client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+ }
+
+ /* open session, and wait for mon and osd maps */
+ err = ceph_monc_open_session(&client->monc);
+ if (err < 0)
+ return err;
+
+ while (!have_mon_and_osd_map(client)) {
+ err = -EIO;
+ if (timeout && time_after_eq(jiffies, started + timeout))
+ return err;
+
+ /* wait */
+ dout("mount waiting for mon_map\n");
+ err = wait_event_interruptible_timeout(client->auth_wq,
+ have_mon_and_osd_map(client) || (client->auth_err < 0),
+ timeout);
+ if (err == -EINTR || err == -ERESTARTSYS)
+ return err;
+ if (client->auth_err < 0)
+ return client->auth_err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+int ceph_open_session(struct ceph_client *client)
+{
+ int ret;
+ unsigned long started = jiffies; /* note the start time */
+
+ dout("open_session start\n");
+ mutex_lock(&client->mount_mutex);
+
+ ret = __ceph_open_session(client, started);
+
+ mutex_unlock(&client->mount_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+static int __init init_ceph_lib(void)
+{
+ int ret = 0;
+
+ ret = ceph_debugfs_init();
+ if (ret < 0)
+ goto out;
+
+ ret = ceph_msgr_init();
+ if (ret < 0)
+ goto out_debugfs;
+
+ pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+ CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+ CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+ CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+ return 0;
+
+out_debugfs:
+ ceph_debugfs_cleanup();
+out:
+ return ret;
+}
+
+static void __exit exit_ceph_lib(void)
+{
+ dout("exit_ceph_lib\n");
+ ceph_msgr_exit();
+ ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 00000000000..a3a3a31d3c3
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,75 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+ __u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ __u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ __u32 os = le32_to_cpu(layout->fl_object_size);
+
+ /* stripe unit, object size must be non-zero, 64k increment */
+ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ /* object size must be a multiple of stripe unit */
+ if (os < su || os % su)
+ return 0;
+ /* stripe count must be non-zero */
+ if (!sc)
+ return 0;
+ return 1;
+}
+
+
+int ceph_flags_to_mode(int flags)
+{
+ int mode;
+
+#ifdef O_DIRECTORY /* fixme */
+ if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ return CEPH_FILE_MODE_PIN;
+#endif
+ if ((flags & O_APPEND) == O_APPEND)
+ flags |= O_WRONLY;
+
+ if ((flags & O_ACCMODE) == O_RDWR)
+ mode = CEPH_FILE_MODE_RDWR;
+ else if ((flags & O_ACCMODE) == O_WRONLY)
+ mode = CEPH_FILE_MODE_WR;
+ else
+ mode = CEPH_FILE_MODE_RD;
+
+#ifdef O_LAZY
+ if (flags & O_LAZY)
+ mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+ return mode;
+}
+EXPORT_SYMBOL(ceph_flags_to_mode);
+
+int ceph_caps_for_mode(int mode)
+{
+ int caps = CEPH_CAP_PIN;
+
+ if (mode & CEPH_FILE_MODE_RD)
+ caps |= CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+ if (mode & CEPH_FILE_MODE_WR)
+ caps |= CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ if (mode & CEPH_FILE_MODE_LAZY)
+ caps |= CEPH_CAP_FILE_LAZYIO;
+
+ return caps;
+}
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 00000000000..815ef882679
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include <linux/ceph/types.h>
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c) \
+ do { \
+ a = a - b; a = a - c; a = a ^ (c >> 13); \
+ b = b - c; b = b - a; b = b ^ (a << 8); \
+ c = c - a; c = c - b; c = c ^ (b >> 13); \
+ a = a - b; a = a - c; a = a ^ (c >> 12); \
+ b = b - c; b = b - a; b = b ^ (a << 16); \
+ c = c - a; c = c - b; c = c ^ (b >> 5); \
+ a = a - b; a = a - c; a = a ^ (c >> 3); \
+ b = b - c; b = b - a; b = b ^ (a << 10); \
+ c = c - a; c = c - b; c = c ^ (b >> 15); \
+ } while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+ const unsigned char *k = (const unsigned char *)str;
+ __u32 a, b, c; /* the internal state */
+ __u32 len; /* how many key bytes still need mixing */
+
+ /* Set up the internal state */
+ len = length;
+ a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ b = a;
+ c = 0; /* variable initialization of internal state */
+
+ /* handle most of the key */
+ while (len >= 12) {
+ a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+ ((__u32)k[3] << 24));
+ b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+ ((__u32)k[7] << 24));
+ c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+ ((__u32)k[11] << 24));
+ mix(a, b, c);
+ k = k + 12;
+ len = len - 12;
+ }
+
+ /* handle the last 11 bytes */
+ c = c + length;
+ switch (len) { /* all the case statements fall through */
+ case 11:
+ c = c + ((__u32)k[10] << 24);
+ case 10:
+ c = c + ((__u32)k[9] << 16);
+ case 9:
+ c = c + ((__u32)k[8] << 8);
+ /* the first byte of c is reserved for the length */
+ case 8:
+ b = b + ((__u32)k[7] << 24);
+ case 7:
+ b = b + ((__u32)k[6] << 16);
+ case 6:
+ b = b + ((__u32)k[5] << 8);
+ case 5:
+ b = b + k[4];
+ case 4:
+ a = a + ((__u32)k[3] << 24);
+ case 3:
+ a = a + ((__u32)k[2] << 16);
+ case 2:
+ a = a + ((__u32)k[1] << 8);
+ case 1:
+ a = a + k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a, b, c);
+
+ return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+ unsigned long hash = 0;
+ unsigned char c;
+
+ while (length--) {
+ c = *str++;
+ hash = (hash + (c << 4) + (c >> 4)) * 11;
+ }
+ return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return ceph_str_hash_linux(s, len);
+ case CEPH_STR_HASH_RJENKINS:
+ return ceph_str_hash_rjenkins(s, len);
+ default:
+ return -1;
+ }
+}
+
+const char *ceph_str_hash_name(int type)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return "linux";
+ case CEPH_STR_HASH_RJENKINS:
+ return "rjenkins";
+ default:
+ return "unknown";
+ }
+}
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 00000000000..3fbda04de29
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+const char *ceph_entity_type_name(int type)
+{
+ switch (type) {
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_AUTH: return "auth";
+ default: return "unknown";
+ }
+}
+
+const char *ceph_osd_op_name(int op)
+{
+ switch (op) {
+ case CEPH_OSD_OP_READ: return "read";
+ case CEPH_OSD_OP_STAT: return "stat";
+
+ case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+ case CEPH_OSD_OP_WRITE: return "write";
+ case CEPH_OSD_OP_DELETE: return "delete";
+ case CEPH_OSD_OP_TRUNCATE: return "truncate";
+ case CEPH_OSD_OP_ZERO: return "zero";
+ case CEPH_OSD_OP_WRITEFULL: return "writefull";
+ case CEPH_OSD_OP_ROLLBACK: return "rollback";
+
+ case CEPH_OSD_OP_APPEND: return "append";
+ case CEPH_OSD_OP_STARTSYNC: return "startsync";
+ case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+ case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+ case CEPH_OSD_OP_TMAPUP: return "tmapup";
+ case CEPH_OSD_OP_TMAPGET: return "tmapget";
+ case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+ case CEPH_OSD_OP_GETXATTR: return "getxattr";
+ case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+ case CEPH_OSD_OP_SETXATTR: return "setxattr";
+ case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+ case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+ case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+ case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+ case CEPH_OSD_OP_PULL: return "pull";
+ case CEPH_OSD_OP_PUSH: return "push";
+ case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+ case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+ case CEPH_OSD_OP_SCRUB: return "scrub";
+
+ case CEPH_OSD_OP_WRLOCK: return "wrlock";
+ case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+ case CEPH_OSD_OP_RDLOCK: return "rdlock";
+ case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+ case CEPH_OSD_OP_UPLOCK: return "uplock";
+ case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+ case CEPH_OSD_OP_CALL: return "call";
+
+ case CEPH_OSD_OP_PGLS: return "pgls";
+ }
+ return "???";
+}
+
+
+const char *ceph_pool_op_name(int op)
+{
+ switch (op) {
+ case POOL_OP_CREATE: return "create";
+ case POOL_OP_DELETE: return "delete";
+ case POOL_OP_AUID_CHANGE: return "auid change";
+ case POOL_OP_CREATE_SNAP: return "create snap";
+ case POOL_OP_DELETE_SNAP: return "delete snap";
+ case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+ case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+ }
+ return "???";
+}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 00000000000..d6ebb13a18a
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include <linux/crush/crush.h>
+
+const char *crush_bucket_alg_name(int alg)
+{
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM: return "uniform";
+ case CRUSH_BUCKET_LIST: return "list";
+ case CRUSH_BUCKET_TREE: return "tree";
+ case CRUSH_BUCKET_STRAW: return "straw";
+ default: return "unknown";
+ }
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+ if (p >= b->size)
+ return 0;
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return ((struct crush_bucket_uniform *)b)->item_weight;
+ case CRUSH_BUCKET_LIST:
+ return ((struct crush_bucket_list *)b)->item_weights[p];
+ case CRUSH_BUCKET_TREE:
+ if (p & 1)
+ return ((struct crush_bucket_tree *)b)->node_weights[p];
+ return 0;
+ case CRUSH_BUCKET_STRAW:
+ return ((struct crush_bucket_straw *)b)->item_weights[p];
+ }
+ return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+ int i, b, c;
+
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ for (i = 0; i < map->buckets[b]->size; i++) {
+ c = map->buckets[b]->items[i];
+ BUG_ON(c >= map->max_devices ||
+ c < -map->max_buckets);
+ if (c >= 0)
+ map->device_parents[c] = map->buckets[b]->id;
+ else
+ map->bucket_parents[-1-c] = map->buckets[b]->id;
+ }
+ }
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+ kfree(b->item_weights);
+ kfree(b->sum_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+ kfree(b->node_weights);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+ kfree(b->straws);
+ kfree(b->item_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+ break;
+ case CRUSH_BUCKET_LIST:
+ crush_destroy_bucket_list((struct crush_bucket_list *)b);
+ break;
+ case CRUSH_BUCKET_TREE:
+ crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+ break;
+ }
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+ int b;
+
+ /* buckets */
+ if (map->buckets) {
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ crush_destroy_bucket(map->buckets[b]);
+ }
+ kfree(map->buckets);
+ }
+
+ /* rules */
+ if (map->rules) {
+ for (b = 0; b < map->max_rules; b++)
+ kfree(map->rules[b]);
+ kfree(map->rules);
+ }
+
+ kfree(map->bucket_parents);
+ kfree(map->device_parents);
+ kfree(map);
+}
+
+
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 00000000000..5bb63e37a8a
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do { \
+ a = a-b; a = a-c; a = a^(c>>13); \
+ b = b-c; b = b-a; b = b^(a<<8); \
+ c = c-a; c = c-b; c = c^(b>>13); \
+ a = a-b; a = a-c; a = a^(c>>12); \
+ b = b-c; b = b-a; b = b^(a<<16); \
+ c = c-a; c = c-b; c = c^(b>>5); \
+ a = a-b; a = a-c; a = a^(c>>3); \
+ b = b-c; b = b-a; b = b^(a<<10); \
+ c = c-a; c = c-b; c = c^(b>>15); \
+ } while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+ __u32 hash = crush_hash_seed ^ a;
+ __u32 b = a;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, a, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(x, a, hash);
+ crush_hashmix(b, y, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(a, x, hash);
+ crush_hashmix(y, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, d, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(e, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ crush_hashmix(d, x, hash);
+ crush_hashmix(y, e, hash);
+ return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1(a);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_2(a, b);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_3(a, b, c);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_4(a, b, c, d);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_5(a, b, c, d, e);
+ default:
+ return 0;
+ }
+}
+
+const char *crush_hash_name(int type)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return "rjenkins1";
+ default:
+ return "unknown";
+ }
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 00000000000..42599e31dca
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+# define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+ int i;
+
+ for (i = 0; i < map->max_rules; i++) {
+ if (map->rules[i] &&
+ map->rules[i]->mask.ruleset == ruleset &&
+ map->rules[i]->mask.type == type &&
+ map->rules[i]->mask.min_size <= size &&
+ map->rules[i]->mask.max_size >= size)
+ return i;
+ }
+ return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors. Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+ int x, int r)
+{
+ unsigned pr = r % bucket->size;
+ unsigned i, s;
+
+ /* start a new permutation if @x has changed */
+ if (bucket->perm_x != x || bucket->perm_n == 0) {
+ dprintk("bucket %d new x=%d\n", bucket->id, x);
+ bucket->perm_x = x;
+
+ /* optimize common r=0 case */
+ if (pr == 0) {
+ s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+ bucket->size;
+ bucket->perm[0] = s;
+ bucket->perm_n = 0xffff; /* magic value, see below */
+ goto out;
+ }
+
+ for (i = 0; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm_n = 0;
+ } else if (bucket->perm_n == 0xffff) {
+ /* clean up after the r=0 case above */
+ for (i = 1; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm[bucket->perm[0]] = 0;
+ bucket->perm_n = 1;
+ }
+
+ /* calculate permutation up to pr */
+ for (i = 0; i < bucket->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+ while (bucket->perm_n <= pr) {
+ unsigned p = bucket->perm_n;
+ /* no point in swapping the final entry */
+ if (p < bucket->size - 1) {
+ i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+ (bucket->size - p);
+ if (i) {
+ unsigned t = bucket->perm[p + i];
+ bucket->perm[p + i] = bucket->perm[p];
+ bucket->perm[p] = t;
+ }
+ dprintk(" perm_choose swap %d with %d\n", p, p+i);
+ }
+ bucket->perm_n++;
+ }
+ for (i = 0; i < bucket->size; i++)
+ dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
+
+ s = bucket->perm[pr];
+out:
+ dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+ bucket->size, x, r, pr, s);
+ return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+ int x, int r)
+{
+ return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+ int x, int r)
+{
+ int i;
+
+ for (i = bucket->h.size-1; i >= 0; i--) {
+ __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+ r, bucket->h.id);
+ w &= 0xffff;
+ dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+ "sw %x rand %llx",
+ i, x, r, bucket->h.items[i], bucket->item_weights[i],
+ bucket->sum_weights[i], w);
+ w *= bucket->sum_weights[i];
+ w = w >> 16;
+ /*dprintk(" scaled %llx\n", w);*/
+ if (w < bucket->item_weights[i])
+ return bucket->h.items[i];
+ }
+
+ BUG_ON(1);
+ return 0;
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+
+static int left(int x)
+{
+ int h = height(x);
+ return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+ int h = height(x);
+ return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+ return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+ int x, int r)
+{
+ int n, l;
+ __u32 w;
+ __u64 t;
+
+ /* start at root */
+ n = bucket->num_nodes >> 1;
+
+ while (!terminal(n)) {
+ /* pick point in [0, w) */
+ w = bucket->node_weights[n];
+ t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+ bucket->h.id) * (__u64)w;
+ t = t >> 32;
+
+ /* descend to the left or right? */
+ l = left(n);
+ if (t < bucket->node_weights[l])
+ n = l;
+ else
+ n = right(n);
+ }
+
+ return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+ int x, int r)
+{
+ int i;
+ int high = 0;
+ __u64 high_draw = 0;
+ __u64 draw;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+ draw &= 0xffff;
+ draw *= bucket->straws[i];
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+ dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+ switch (in->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+ x, r);
+ case CRUSH_BUCKET_LIST:
+ return bucket_list_choose((struct crush_bucket_list *)in,
+ x, r);
+ case CRUSH_BUCKET_TREE:
+ return bucket_tree_choose((struct crush_bucket_tree *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW:
+ return bucket_straw_choose((struct crush_bucket_straw *)in,
+ x, r);
+ default:
+ BUG_ON(1);
+ return in->items[0];
+ }
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+ if (weight[item] >= 0x10000)
+ return 0;
+ if (weight[item] == 0)
+ return 1;
+ if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+ < weight[item])
+ return 0;
+ return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+ struct crush_bucket *bucket,
+ __u32 *weight,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ int firstn, int recurse_to_leaf,
+ int *out2)
+{
+ int rep;
+ int ftotal, flocal;
+ int retry_descent, retry_bucket, skip_rep;
+ struct crush_bucket *in = bucket;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide, reject;
+ const int orig_tries = 5; /* attempts before we fall back to search */
+
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
+
+ for (rep = outpos; rep < numrep; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+ do {
+ retry_descent = 0;
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ flocal = 0;
+ do {
+ collide = 0;
+ retry_bucket = 0;
+ r = rep;
+ if (in->alg == CRUSH_BUCKET_UNIFORM) {
+ /* be careful */
+ if (firstn || numrep >= in->size)
+ /* r' = r + f_total */
+ r += ftotal;
+ else if (in->size % numrep == 0)
+ /* r'=r+(n+1)*f_local */
+ r += (numrep+1) *
+ (flocal+ftotal);
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ } else {
+ if (firstn)
+ /* r' = r + f_total */
+ r += ftotal;
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ }
+
+ /* bucket choose */
+ if (in->size == 0) {
+ reject = 1;
+ goto reject;
+ }
+ if (flocal >= (in->size>>1) &&
+ flocal > orig_tries)
+ item = bucket_perm_choose(in, x, r);
+ else
+ item = crush_bucket_choose(in, x, r);
+ BUG_ON(item >= map->max_devices);
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ BUG_ON(item >= 0 ||
+ (-1-item) >= map->max_buckets);
+ in = map->buckets[-1-item];
+ retry_bucket = 1;
+ continue;
+ }
+
+ /* collision? */
+ for (i = 0; i < outpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+
+ reject = 0;
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ if (crush_choose(map,
+ map->buckets[-1-item],
+ weight,
+ x, outpos+1, 0,
+ out2, outpos,
+ firstn, 0,
+ NULL) <= outpos)
+ /* didn't get leaf */
+ reject = 1;
+ } else {
+ /* we already have a leaf! */
+ out2[outpos] = item;
+ }
+ }
+
+ if (!reject) {
+ /* out? */
+ if (itemtype == 0)
+ reject = is_out(map, weight,
+ item, x);
+ else
+ reject = 0;
+ }
+
+reject:
+ if (reject || collide) {
+ ftotal++;
+ flocal++;
+
+ if (collide && flocal < 3)
+ /* retry locally a few times */
+ retry_bucket = 1;
+ else if (flocal < in->size + orig_tries)
+ /* exhaustive bucket search */
+ retry_bucket = 1;
+ else if (ftotal < 20)
+ /* then retry descent */
+ retry_descent = 1;
+ else
+ /* else give up */
+ skip_rep = 1;
+ dprintk(" reject %d collide %d "
+ "ftotal %d flocal %d\n",
+ reject, collide, ftotal,
+ flocal);
+ }
+ } while (retry_bucket);
+ } while (retry_descent);
+
+ if (skip_rep) {
+ dprintk("skip rep\n");
+ continue;
+ }
+
+ dprintk("CHOOSE got %d\n", item);
+ out[outpos] = item;
+ outpos++;
+ }
+
+ dprintk("CHOOSE returns %d\n", outpos);
+ return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
+int crush_do_rule(struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ int force, __u32 *weight)
+{
+ int result_len;
+ int force_context[CRUSH_MAX_DEPTH];
+ int force_pos = -1;
+ int a[CRUSH_MAX_SET];
+ int b[CRUSH_MAX_SET];
+ int c[CRUSH_MAX_SET];
+ int recurse_to_leaf;
+ int *w;
+ int wsize = 0;
+ int *o;
+ int osize;
+ int *tmp;
+ struct crush_rule *rule;
+ int step;
+ int i, j;
+ int numrep;
+ int firstn;
+ int rc = -1;
+
+ BUG_ON(ruleno >= map->max_rules);
+
+ rule = map->rules[ruleno];
+ result_len = 0;
+ w = a;
+ o = b;
+
+ /*
+ * determine hierarchical context of force, if any. note
+ * that this may or may not correspond to the specific types
+ * referenced by the crush rule.
+ */
+ if (force >= 0) {
+ if (force >= map->max_devices ||
+ map->device_parents[force] == 0) {
+ /*dprintk("CRUSH: forcefed device dne\n");*/
+ rc = -1; /* force fed device dne */
+ goto out;
+ }
+ if (!is_out(map, weight, force, x)) {
+ while (1) {
+ force_context[++force_pos] = force;
+ if (force >= 0)
+ force = map->device_parents[force];
+ else
+ force = map->bucket_parents[-1-force];
+ if (force == 0)
+ break;
+ }
+ }
+ }
+
+ for (step = 0; step < rule->len; step++) {
+ firstn = 0;
+ switch (rule->steps[step].op) {
+ case CRUSH_RULE_TAKE:
+ w[0] = rule->steps[step].arg1;
+ if (force_pos >= 0) {
+ BUG_ON(force_context[force_pos] != w[0]);
+ force_pos--;
+ }
+ wsize = 1;
+ break;
+
+ case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ firstn = 1;
+ case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+ case CRUSH_RULE_CHOOSE_INDEP:
+ BUG_ON(wsize == 0);
+
+ recurse_to_leaf =
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+ /* reset output */
+ osize = 0;
+
+ for (i = 0; i < wsize; i++) {
+ /*
+ * see CRUSH_N, CRUSH_N_MINUS macros.
+ * basically, numrep <= 0 means relative to
+ * the provided result_max
+ */
+ numrep = rule->steps[step].arg1;
+ if (numrep <= 0) {
+ numrep += result_max;
+ if (numrep <= 0)
+ continue;
+ }
+ j = 0;
+ if (osize == 0 && force_pos >= 0) {
+ /* skip any intermediate types */
+ while (force_pos &&
+ force_context[force_pos] < 0 &&
+ rule->steps[step].arg2 !=
+ map->buckets[-1 -
+ force_context[force_pos]]->type)
+ force_pos--;
+ o[osize] = force_context[force_pos];
+ if (recurse_to_leaf)
+ c[osize] = force_context[0];
+ j++;
+ force_pos--;
+ }
+ osize += crush_choose(map,
+ map->buckets[-1-w[i]],
+ weight,
+ x, numrep,
+ rule->steps[step].arg2,
+ o+osize, j,
+ firstn,
+ recurse_to_leaf, c+osize);
+ }
+
+ if (recurse_to_leaf)
+ /* copy final _leaf_ values to output set */
+ memcpy(o, c, osize*sizeof(*o));
+
+ /* swap t and w arrays */
+ tmp = o;
+ o = w;
+ w = tmp;
+ wsize = osize;
+ break;
+
+
+ case CRUSH_RULE_EMIT:
+ for (i = 0; i < wsize && result_len < result_max; i++) {
+ result[result_len] = w[i];
+ result_len++;
+ }
+ wsize = 0;
+ break;
+
+ default:
+ BUG_ON(1);
+ }
+ }
+ rc = result_len;
+
+out:
+ return rc;
+}
+
+
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 00000000000..7b505b0c983
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,412 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <crypto/hash.h>
+
+#include <linux/ceph/decode.h>
+#include "crypto.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ if (*p + sizeof(u16) + sizeof(key->created) +
+ sizeof(u16) + key->len > end)
+ return -ERANGE;
+ ceph_encode_16(p, key->type);
+ ceph_encode_copy(p, &key->created, sizeof(key->created));
+ ceph_encode_16(p, key->len);
+ ceph_encode_copy(p, key->key, key->len);
+ return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+ key->type = ceph_decode_16(p);
+ ceph_decode_copy(p, &key->created, sizeof(key->created));
+ key->len = ceph_decode_16(p);
+ ceph_decode_need(p, end, key->len, bad);
+ key->key = kmalloc(key->len, GFP_NOFS);
+ if (!key->key)
+ return -ENOMEM;
+ ceph_decode_copy(p, key->key, key->len);
+ return 0;
+
+bad:
+ dout("failed to decode crypto key\n");
+ return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+ int inlen = strlen(inkey);
+ int blen = inlen * 3 / 4;
+ void *buf, *p;
+ int ret;
+
+ dout("crypto_key_unarmor %s\n", inkey);
+ buf = kmalloc(blen, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+ blen = ceph_unarmor(buf, inkey, inkey+inlen);
+ if (blen < 0) {
+ kfree(buf);
+ return blen;
+ }
+
+ p = buf;
+ ret = ceph_crypto_key_decode(key, &p, p + blen);
+ kfree(buf);
+ if (ret)
+ return ret;
+ dout("crypto_key_unarmor key %p type %d len %d\n", key,
+ key->type, key->len);
+ return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+ return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
+
+static int ceph_aes_encrypt(const void *key, int key_len,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[2], sg_out[1];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - (src_len & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src_len + zero_padding;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 2);
+ sg_set_buf(&sg_in[0], src, src_len);
+ sg_set_buf(&sg_in[1], pad, zero_padding);
+ sg_init_table(sg_out, 1);
+ sg_set_buf(sg_out, dst, *dst_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+ src_len + zero_padding);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0)
+ pr_err("ceph_aes_crypt failed %d\n", ret);
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return 0;
+}
+
+static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
+ size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ struct scatterlist sg_in[3], sg_out[1];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src1_len + src2_len + zero_padding;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 3);
+ sg_set_buf(&sg_in[0], src1, src1_len);
+ sg_set_buf(&sg_in[1], src2, src2_len);
+ sg_set_buf(&sg_in[2], pad, zero_padding);
+ sg_init_table(sg_out, 1);
+ sg_set_buf(sg_out, dst, *dst_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+ src1, src1_len, 1);
+ print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+ src2, src2_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+ src1_len + src2_len + zero_padding);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0)
+ pr_err("ceph_aes_crypt2 failed %d\n", ret);
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return 0;
+}
+
+static int ceph_aes_decrypt(const void *key, int key_len,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[1], sg_out[2];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 1);
+ sg_init_table(sg_out, 2);
+ sg_set_buf(sg_in, src, src_len);
+ sg_set_buf(&sg_out[0], dst, *dst_len);
+ sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);
+ return ret;
+ }
+
+ if (src_len <= *dst_len)
+ last_byte = ((char *)dst)[src_len - 1];
+ else
+ last_byte = pad[src_len - *dst_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ *dst_len = src_len - last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return 0;
+}
+
+static int ceph_aes_decrypt2(const void *key, int key_len,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[1], sg_out[3];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ sg_init_table(sg_in, 1);
+ sg_set_buf(sg_in, src, src_len);
+ sg_init_table(sg_out, 3);
+ sg_set_buf(&sg_out[0], dst1, *dst1_len);
+ sg_set_buf(&sg_out[1], dst2, *dst2_len);
+ sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);
+ return ret;
+ }
+
+ if (src_len <= *dst1_len)
+ last_byte = ((char *)dst1)[src_len - 1];
+ else if (src_len <= *dst1_len + *dst2_len)
+ last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+ else
+ last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ src_len -= last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+
+ if (src_len < *dst1_len) {
+ *dst1_len = src_len;
+ *dst2_len = 0;
+ } else {
+ *dst2_len = src_len - *dst1_len;
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
+ dst1, *dst1_len, 1);
+ print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
+ dst2, *dst2_len, 1);
+ */
+
+ return 0;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ size_t t;
+
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst1_len + *dst2_len < src_len)
+ return -ERANGE;
+ t = min(*dst1_len, src_len);
+ memcpy(dst1, src, t);
+ *dst1_len = t;
+ src += t;
+ src_len -= t;
+ if (src_len) {
+ t = min(*dst2_len, src_len);
+ memcpy(dst2, src, t);
+ *dst2_len = t;
+ }
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt2(secret->key, secret->len,
+ dst1, dst1_len, dst2, dst2_len,
+ src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src1_len + src2_len)
+ return -ERANGE;
+ memcpy(dst, src1, src1_len);
+ memcpy(dst + src1_len, src2, src2_len);
+ *dst_len = src1_len + src2_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+ src1, src1_len, src2, src2_len);
+
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 00000000000..f9eccace592
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+ int type;
+ struct ceph_timespec created;
+ int len;
+ void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+ kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+ void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+ void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const char *src, const char *end);
+extern int ceph_unarmor(char *dst, const char *src, const char *end);
+
+#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 00000000000..27d4ea315d1
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client* - an instance of the ceph client
+ * .../osdmap - current osdmap
+ * .../monmap - current monmap
+ * .../osdc - active osd requests
+ * .../monc - mon client state
+ * .../dentry_lru - dump contents of dentry lru
+ * .../caps - expose cap (reservation) stats
+ * .../bdi - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->monc.monmap == NULL)
+ return 0;
+
+ seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+ for (i = 0; i < client->monc.monmap->num_mon; i++) {
+ struct ceph_entity_inst *inst =
+ &client->monc.monmap->mon_inst[i];
+
+ seq_printf(s, "\t%s%lld\t%s\n",
+ ENTITY_NAME(inst->name),
+ ceph_pr_addr(&inst->addr.in_addr));
+ }
+ return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+ struct rb_node *n;
+
+ if (client->osdc.osdmap == NULL)
+ return 0;
+ seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+ seq_printf(s, "flags%s%s\n",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+ " NEARFULL" : "",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+ " FULL" : "");
+ for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pool =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+ seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+ pool->id, pool->v.pg_num, pool->pg_num_mask,
+ pool->v.lpg_num, pool->lpg_num_mask);
+ }
+ for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+ struct ceph_entity_addr *addr =
+ &client->osdc.osdmap->osd_addr[i];
+ int state = client->osdc.osdmap->osd_state[i];
+ char sb[64];
+
+ seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+ i, ceph_pr_addr(&addr->in_addr),
+ ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state));
+ }
+ return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_client *monc = &client->monc;
+ struct rb_node *rp;
+
+ mutex_lock(&monc->mutex);
+
+ if (monc->have_mdsmap)
+ seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+ if (monc->have_osdmap)
+ seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+ if (monc->want_next_osdmap)
+ seq_printf(s, "want next osdmap\n");
+
+ for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+ __u16 op;
+ req = rb_entry(rp, struct ceph_mon_generic_request, node);
+ op = le16_to_cpu(req->request->hdr.type);
+ if (op == CEPH_MSG_STATFS)
+ seq_printf(s, "%lld statfs\n", req->tid);
+ else
+ seq_printf(s, "%lld unknown\n", req->tid);
+ }
+
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *p;
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req;
+ struct ceph_osd_request_head *head;
+ struct ceph_osd_op *op;
+ int num_ops;
+ int opcode, olen;
+ int i;
+
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1,
+ le32_to_cpu(req->r_pgid.pool),
+ le16_to_cpu(req->r_pgid.ps));
+
+ head = req->r_request->front.iov_base;
+ op = (void *)(head + 1);
+
+ num_ops = le16_to_cpu(head->num_ops);
+ olen = le32_to_cpu(head->object_len);
+ seq_printf(s, "%.*s", olen,
+ (const char *)(head->ops + num_ops));
+
+ if (req->r_reassert_version.epoch)
+ seq_printf(s, "\t%u'%llu",
+ (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+ le64_to_cpu(req->r_reassert_version.version));
+ else
+ seq_printf(s, "\t");
+
+ for (i = 0; i < num_ops; i++) {
+ opcode = le16_to_cpu(op->op);
+ seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+ op++;
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&osdc->request_mutex);
+ return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+int ceph_debugfs_init(void)
+{
+ ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+ if (!ceph_debugfs_dir)
+ return -ENOMEM;
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+ debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ int ret = -ENOMEM;
+ char name[80];
+
+ snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+ client->monc.auth->global_id);
+
+ client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+ if (!client->debugfs_dir)
+ goto out;
+
+ client->monc.debugfs_file = debugfs_create_file("monc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monc_show_fops);
+ if (!client->monc.debugfs_file)
+ goto out;
+
+ client->osdc.debugfs_file = debugfs_create_file("osdc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdc_show_fops);
+ if (!client->osdc.debugfs_file)
+ goto out;
+
+ client->debugfs_monmap = debugfs_create_file("monmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monmap_show_fops);
+ if (!client->debugfs_monmap)
+ goto out;
+
+ client->debugfs_osdmap = debugfs_create_file("osdmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdmap_show_fops);
+ if (!client->debugfs_osdmap)
+ goto out;
+
+ return 0;
+
+out:
+ ceph_debugfs_client_cleanup(client);
+ return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+ debugfs_remove(client->debugfs_osdmap);
+ debugfs_remove(client->debugfs_monmap);
+ debugfs_remove(client->osdc.debugfs_file);
+ debugfs_remove(client->monc.debugfs_file);
+ debugfs_remove(client->debugfs_dir);
+}
+
+#else /* CONFIG_DEBUG_FS */
+
+int ceph_debugfs_init(void)
+{
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 00000000000..0e8157ee5d4
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2453 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system. The messenger provides ordered and reliable
+ * delivery. We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error). Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+{
+ int i;
+ char *s;
+ struct sockaddr_in *in4 = (void *)ss;
+ struct sockaddr_in6 *in6 = (void *)ss;
+
+ spin_lock(&addr_str_lock);
+ i = last_addr_str++;
+ if (last_addr_str == MAX_ADDR_STR)
+ last_addr_str = 0;
+ spin_unlock(&addr_str_lock);
+ s = addr_str[i];
+
+ switch (ss->ss_family) {
+ case AF_INET:
+ snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+ (unsigned int)ntohs(in4->sin_port));
+ break;
+
+ case AF_INET6:
+ snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+ (unsigned int)ntohs(in6->sin6_port));
+ break;
+
+ default:
+ sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+ }
+
+ return s;
+}
+EXPORT_SYMBOL(ceph_pr_addr);
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+ ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int ceph_msgr_init(void)
+{
+ ceph_msgr_wq = create_workqueue("ceph-msgr");
+ if (IS_ERR(ceph_msgr_wq)) {
+ int ret = PTR_ERR(ceph_msgr_wq);
+ pr_err("msgr_init failed to create workqueue: %d\n", ret);
+ ceph_msgr_wq = NULL;
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_msgr_init);
+
+void ceph_msgr_exit(void)
+{
+ destroy_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_exit);
+
+void ceph_msgr_flush(void)
+{
+ flush_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_flush);
+
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+ if (sk->sk_state != TCP_CLOSE_WAIT) {
+ dout("ceph_data_ready on %p state = %lu, queueing work\n",
+ con, con->state);
+ queue_con(con);
+ }
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ /* only queue to workqueue if there is data we want to write. */
+ if (test_bit(WRITE_PENDING, &con->state)) {
+ dout("ceph_write_space %p queueing write work\n", con);
+ queue_con(con);
+ } else {
+ dout("ceph_write_space %p nothing to write\n", con);
+ }
+
+ /* since we have our own write_space, clear the SOCK_NOSPACE flag */
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ dout("ceph_state_change %p state = %lu sk_state = %u\n",
+ con, con->state, sk->sk_state);
+
+ if (test_bit(CLOSED, &con->state))
+ return;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ dout("ceph_state_change TCP_CLOSE\n");
+ case TCP_CLOSE_WAIT:
+ dout("ceph_state_change TCP_CLOSE_WAIT\n");
+ if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+ if (test_bit(CONNECTING, &con->state))
+ con->error_msg = "connection failed";
+ else
+ con->error_msg = "socket closed";
+ queue_con(con);
+ }
+ break;
+ case TCP_ESTABLISHED:
+ dout("ceph_state_change TCP_ESTABLISHED\n");
+ queue_con(con);
+ break;
+ }
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+ struct ceph_connection *con)
+{
+ struct sock *sk = sock->sk;
+ sk->sk_user_data = (void *)con;
+ sk->sk_data_ready = ceph_data_ready;
+ sk->sk_write_space = ceph_write_space;
+ sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+ struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+ struct socket *sock;
+ int ret;
+
+ BUG_ON(con->sock);
+ ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+ if (ret)
+ return ERR_PTR(ret);
+ con->sock = sock;
+ sock->sk->sk_allocation = GFP_NOFS;
+
+#ifdef CONFIG_LOCKDEP
+ lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+ set_sock_callbacks(sock, con);
+
+ dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+
+ ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+ O_NONBLOCK);
+ if (ret == -EINPROGRESS) {
+ dout("connect %s EINPROGRESS sk_state = %u\n",
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ sock->sk->sk_state);
+ ret = 0;
+ }
+ if (ret < 0) {
+ pr_err("connect %s error %d\n",
+ ceph_pr_addr(&con->peer_addr.in_addr), ret);
+ sock_release(sock);
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+ return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, int more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+ int rc;
+
+ dout("con_close_socket on %p sock %p\n", con, con->sock);
+ if (!con->sock)
+ return 0;
+ set_bit(SOCK_CLOSED, &con->state);
+ rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+ sock_release(con->sock);
+ con->sock = NULL;
+ clear_bit(SOCK_CLOSED, &con->state);
+ return rc;
+}
+
+/*
+ * Reset a connection. Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+ list_head);
+ ceph_msg_remove(msg);
+ }
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+ /* reset connection, out_queue, msg_ and connect_seq */
+ /* discard existing out_queue and msg_seq */
+ ceph_msg_remove_list(&con->out_queue);
+ ceph_msg_remove_list(&con->out_sent);
+
+ if (con->in_msg) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+
+ con->connect_seq = 0;
+ con->out_seq = 0;
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ con->out_keepalive_pending = false;
+ con->in_seq = 0;
+ con->in_seq_acked = 0;
+}
+
+/*
+ * mark a peer down. drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+ dout("con_close %p peer %s\n", con,
+ ceph_pr_addr(&con->peer_addr.in_addr));
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
+ clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
+ clear_bit(KEEPALIVE_PENDING, &con->state);
+ clear_bit(WRITE_PENDING, &con->state);
+ mutex_lock(&con->mutex);
+ reset_connection(con);
+ con->peer_global_seq = 0;
+ cancel_delayed_work(&con->work);
+ mutex_unlock(&con->mutex);
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_close);
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+ dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+ set_bit(OPENING, &con->state);
+ clear_bit(CLOSED, &con->state);
+ memcpy(&con->peer_addr, addr, sizeof(*addr));
+ con->delay = 0; /* reset backoff memory */
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_open);
+
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+ return con->connect_seq > 0;
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+ dout("con_get %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+ if (atomic_inc_not_zero(&con->nref))
+ return con;
+ return NULL;
+}
+
+void ceph_con_put(struct ceph_connection *con)
+{
+ dout("con_put %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+ BUG_ON(atomic_read(&con->nref) == 0);
+ if (atomic_dec_and_test(&con->nref)) {
+ BUG_ON(con->sock);
+ kfree(con);
+ }
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+ dout("con_init %p\n", con);
+ memset(con, 0, sizeof(*con));
+ atomic_set(&con->nref, 1);
+ con->msgr = msgr;
+ mutex_init(&con->mutex);
+ INIT_LIST_HEAD(&con->out_queue);
+ INIT_LIST_HEAD(&con->out_sent);
+ INIT_DELAYED_WORK(&con->work, con_work);
+}
+EXPORT_SYMBOL(ceph_con_init);
+
+
+/*
+ * We maintain a global counter to order connection attempts. Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+ u32 ret;
+
+ spin_lock(&msgr->global_seq_lock);
+ if (msgr->global_seq < gt)
+ msgr->global_seq = gt;
+ ret = ++msgr->global_seq;
+ spin_unlock(&msgr->global_seq_lock);
+ return ret;
+}
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off. Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+ struct ceph_msg *m = con->out_msg;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con->out_kvec_is_msg = true;
+ con->out_kvec[v].iov_base = &m->footer;
+ con->out_kvec[v].iov_len = sizeof(m->footer);
+ con->out_kvec_bytes += sizeof(m->footer);
+ con->out_kvec_left++;
+ con->out_more = m->more_to_follow;
+ con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ int v = 0;
+
+ con->out_kvec_bytes = 0;
+ con->out_kvec_is_msg = true;
+ con->out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con->out_kvec[v].iov_base = &tag_ack;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[v].iov_base = &con->out_temp_ack;
+ con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ }
+
+ m = list_first_entry(&con->out_queue,
+ struct ceph_msg, list_head);
+ con->out_msg = m;
+ if (test_bit(LOSSYTX, &con->state)) {
+ list_del_init(&m->list_head);
+ } else {
+ /* put message on sent list */
+ ceph_msg_get(m);
+ list_move_tail(&m->list_head, &con->out_sent);
+ }
+
+ /*
+ * only assign outgoing seq # if we haven't sent this message
+ * yet. if it is requeued, resend with it's original seq.
+ */
+ if (m->needs_out_seq) {
+ m->hdr.seq = cpu_to_le64(++con->out_seq);
+ m->needs_out_seq = false;
+ }
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ le32_to_cpu(m->hdr.data_len),
+ m->nr_pages);
+ BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+ /* tag + hdr + front + middle */
+ con->out_kvec[v].iov_base = &tag_msg;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_kvec[v].iov_base = &m->hdr;
+ con->out_kvec[v++].iov_len = sizeof(m->hdr);
+ con->out_kvec[v++] = m->front;
+ if (m->middle)
+ con->out_kvec[v++] = m->middle->vec;
+ con->out_kvec_left = v;
+ con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+ (m->middle ? m->middle->vec.iov_len : 0);
+ con->out_kvec_cur = con->out_kvec;
+
+ /* fill in crc (except data pages), footer */
+ con->out_msg->hdr.crc =
+ cpu_to_le32(crc32c(0, (void *)&m->hdr,
+ sizeof(m->hdr) - sizeof(m->hdr.crc)));
+ con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+ con->out_msg->footer.front_crc =
+ cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+ if (m->middle)
+ con->out_msg->footer.middle_crc =
+ cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len));
+ else
+ con->out_msg->footer.middle_crc = 0;
+ con->out_msg->footer.data_crc = 0;
+ dout("prepare_write_message front_crc %u data_crc %u\n",
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
+
+ /* is there a data payload? */
+ if (le32_to_cpu(m->hdr.data_len) > 0) {
+ /* initialize page iterator */
+ con->out_msg_pos.page = 0;
+ if (m->pages)
+ con->out_msg_pos.page_pos =
+ le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+ else
+ con->out_msg_pos.page_pos = 0;
+ con->out_msg_pos.data_pos = 0;
+ con->out_msg_pos.did_page_crc = 0;
+ con->out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con, v);
+ }
+
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con->out_kvec[0].iov_base = &tag_ack;
+ con->out_kvec[0].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[1].iov_base = &con->out_temp_ack;
+ con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 1; /* more will follow.. eventually.. */
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con->out_kvec[0].iov_base = &tag_keepalive;
+ con->out_kvec[0].iov_len = 1;
+ con->out_kvec_left = 1;
+ con->out_kvec_bytes = 1;
+ con->out_kvec_cur = con->out_kvec;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+ void *auth_buf;
+ int auth_len = 0;
+ int auth_protocol = 0;
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->get_authorizer)
+ con->ops->get_authorizer(con, &auth_buf, &auth_len,
+ &auth_protocol, &con->auth_reply_buf,
+ &con->auth_reply_buf_len,
+ con->auth_retry);
+ mutex_lock(&con->mutex);
+
+ con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+ con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+ con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+ con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+ con->out_kvec_left++;
+ con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+ struct ceph_connection *con)
+{
+ int len = strlen(CEPH_BANNER);
+
+ con->out_kvec[0].iov_base = CEPH_BANNER;
+ con->out_kvec[0].iov_len = len;
+ con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+ con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+ struct ceph_connection *con,
+ int after_banner)
+{
+ unsigned global_seq = get_global_seq(con->msgr, 0);
+ int proto;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->connect_seq, global_seq, proto);
+
+ con->out_connect.features = cpu_to_le64(msgr->supported_features);
+ con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+ con->out_connect.global_seq = cpu_to_le32(global_seq);
+ con->out_connect.protocol_version = cpu_to_le32(proto);
+ con->out_connect.flags = 0;
+
+ if (!after_banner) {
+ con->out_kvec_left = 0;
+ con->out_kvec_bytes = 0;
+ }
+ con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+ con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+ con->out_kvec_left++;
+ con->out_kvec_bytes += sizeof(con->out_connect);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+
+ prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+ while (con->out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+ con->out_kvec_left, con->out_kvec_bytes,
+ con->out_more);
+ if (ret <= 0)
+ goto out;
+ con->out_kvec_bytes -= ret;
+ if (con->out_kvec_bytes == 0)
+ break; /* done */
+ while (ret > 0) {
+ if (ret >= con->out_kvec_cur->iov_len) {
+ ret -= con->out_kvec_cur->iov_len;
+ con->out_kvec_cur++;
+ con->out_kvec_left--;
+ } else {
+ con->out_kvec_cur->iov_len -= ret;
+ con->out_kvec_cur->iov_base += ret;
+ ret = 0;
+ break;
+ }
+ }
+ }
+ con->out_kvec_left = 0;
+ con->out_kvec_is_msg = false;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->out_kvec_bytes, con->out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+ if (!bio) {
+ *iter = NULL;
+ *seg = 0;
+ return;
+ }
+ *iter = bio;
+ *seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+ if (*bio_iter == NULL)
+ return;
+
+ BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+ (*seg)++;
+ if (*seg == (*bio_iter)->bi_vcnt)
+ init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+ unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+ size_t len;
+ int crc = con->msgr->nocrc;
+ int ret;
+ int total_max_write;
+ int in_trail = 0;
+ size_t trail_len = (msg->trail ? msg->trail->length : 0);
+
+ dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+ con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+ con->out_msg_pos.page_pos);
+
+#ifdef CONFIG_BLOCK
+ if (msg->bio && !msg->bio_iter)
+ init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+ while (data_len > con->out_msg_pos.data_pos) {
+ struct page *page = NULL;
+ void *kaddr = NULL;
+ int max_write = PAGE_SIZE;
+ int page_shift = 0;
+
+ total_max_write = data_len - trail_len -
+ con->out_msg_pos.data_pos;
+
+ /*
+ * if we are calculating the data crc (the default), we need
+ * to map the page. if our pages[] has been revoked, use the
+ * zero page.
+ */
+
+ /* have we reached the trail part of the data? */
+ if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+ in_trail = 1;
+
+ total_max_write = data_len - con->out_msg_pos.data_pos;
+
+ page = list_first_entry(&msg->trail->head,
+ struct page, lru);
+ if (crc)
+ kaddr = kmap(page);
+ max_write = PAGE_SIZE;
+ } else if (msg->pages) {
+ page = msg->pages[con->out_msg_pos.page];
+ if (crc)
+ kaddr = kmap(page);
+ } else if (msg->pagelist) {
+ page = list_first_entry(&msg->pagelist->head,
+ struct page, lru);
+ if (crc)
+ kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+ } else if (msg->bio) {
+ struct bio_vec *bv;
+
+ bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+ page = bv->bv_page;
+ page_shift = bv->bv_offset;
+ if (crc)
+ kaddr = kmap(page) + page_shift;
+ max_write = bv->bv_len;
+#endif
+ } else {
+ page = con->msgr->zero_page;
+ if (crc)
+ kaddr = page_address(con->msgr->zero_page);
+ }
+ len = min_t(int, max_write - con->out_msg_pos.page_pos,
+ total_max_write);
+
+ if (crc && !con->out_msg_pos.did_page_crc) {
+ void *base = kaddr + con->out_msg_pos.page_pos;
+ u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+ BUG_ON(kaddr == NULL);
+ con->out_msg->footer.data_crc =
+ cpu_to_le32(crc32c(tmpcrc, base, len));
+ con->out_msg_pos.did_page_crc = 1;
+ }
+ ret = kernel_sendpage(con->sock, page,
+ con->out_msg_pos.page_pos + page_shift,
+ len,
+ MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_MORE);
+
+ if (crc &&
+ (msg->pages || msg->pagelist || msg->bio || in_trail))
+ kunmap(page);
+
+ if (ret <= 0)
+ goto out;
+
+ con->out_msg_pos.data_pos += ret;
+ con->out_msg_pos.page_pos += ret;
+ if (ret == len) {
+ con->out_msg_pos.page_pos = 0;
+ con->out_msg_pos.page++;
+ con->out_msg_pos.did_page_crc = 0;
+ if (in_trail)
+ list_move_tail(&page->lru,
+ &msg->trail->head);
+ else if (msg->pagelist)
+ list_move_tail(&page->lru,
+ &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+ else if (msg->bio)
+ iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
+ }
+ }
+
+ dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+ /* prepare and queue up footer, too */
+ if (!crc)
+ con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con->out_kvec_bytes = 0;
+ con->out_kvec_left = 0;
+ con->out_kvec_cur = con->out_kvec;
+ prepare_write_message_footer(con, 0);
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int ret;
+
+ while (con->out_skip > 0) {
+ struct kvec iov = {
+ .iov_base = page_address(con->msgr->zero_page),
+ .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+ };
+
+ ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+ if (ret <= 0)
+ goto out;
+ con->out_skip -= ret;
+ }
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->in_base_pos = 0;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+ return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+ int *to, int size, void *object)
+{
+ *to += size;
+ while (con->in_base_pos < *to) {
+ int left = *to - con->in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+ /* peer's banner */
+ ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+ &con->actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+ &con->peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+out:
+ return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+ ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+ con->auth_reply_buf);
+ if (ret <= 0)
+ goto out;
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, (int)con->in_reply.tag,
+ le32_to_cpu(con->in_reply.connect_seq),
+ le32_to_cpu(con->in_reply.global_seq));
+out:
+ return ret;
+
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ ceph_pr_addr(&con->peer_addr.in_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }
+ return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+ case AF_INET6:
+ return
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+ }
+ return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ntohs(((struct sockaddr_in *)ss)->sin_port);
+ case AF_INET6:
+ return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+ }
+ return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)ss)->sin_port = htons(p);
+ case AF_INET6:
+ ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+ }
+}
+
+/*
+ * Parse an ip[:port] list into an addr array. Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count)
+{
+ int i;
+ const char *p = c;
+
+ dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+ for (i = 0; i < max_count; i++) {
+ const char *ipend;
+ struct sockaddr_storage *ss = &addr[i].in_addr;
+ struct sockaddr_in *in4 = (void *)ss;
+ struct sockaddr_in6 *in6 = (void *)ss;
+ int port;
+ char delim = ',';
+
+ if (*p == '[') {
+ delim = ']';
+ p++;
+ }
+
+ memset(ss, 0, sizeof(*ss));
+ if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+ delim, &ipend))
+ ss->ss_family = AF_INET;
+ else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+ delim, &ipend))
+ ss->ss_family = AF_INET6;
+ else
+ goto bad;
+ p = ipend;
+
+ if (delim == ']') {
+ if (*p != ']') {
+ dout("missing matching ']'\n");
+ goto bad;
+ }
+ p++;
+ }
+
+ /* port? */
+ if (p < end && *p == ':') {
+ port = 0;
+ p++;
+ while (p < end && *p >= '0' && *p <= '9') {
+ port = (port * 10) + (*p - '0');
+ p++;
+ }
+ if (port > 65535 || port == 0)
+ goto bad;
+ } else {
+ port = CEPH_MON_PORT;
+ }
+
+ addr_set_port(ss, port);
+
+ dout("parse_ips got %s\n", ceph_pr_addr(ss));
+
+ if (p == end)
+ break;
+ if (*p != ',')
+ goto bad;
+ p++;
+ }
+
+ if (p != end)
+ goto bad;
+
+ if (count)
+ *count = i + 1;
+ return 0;
+
+bad:
+ pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_parse_ips);
+
+static int process_banner(struct ceph_connection *con)
+{
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ ceph_decode_addr(&con->actual_peer_addr);
+ ceph_decode_addr(&con->peer_addr_for_me);
+
+ /*
+ * Make sure the other end is who we wanted. note that the other
+ * end may not yet know their ip address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+ con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warning("wrong peer, want %s/%d, got %s/%d\n",
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ (int)le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&con->actual_peer_addr.in_addr),
+ (int)le32_to_cpu(con->actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+ int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+ memcpy(&con->msgr->inst.addr.in_addr,
+ &con->peer_addr_for_me.in_addr,
+ sizeof(con->peer_addr_for_me.in_addr));
+ addr_set_port(&con->msgr->inst.addr.in_addr, port);
+ encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+ }
+
+ set_bit(NEGOTIATING, &con->state);
+ prepare_read_connect(con);
+ return 0;
+}
+
+static void fail_protocol(struct ceph_connection *con)
+{
+ reset_connection(con);
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->bad_proto)
+ con->ops->bad_proto(con);
+ mutex_lock(&con->mutex);
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = con->msgr->supported_features;
+ u64 req_feat = con->msgr->required_features;
+ u64 server_feat = le64_to_cpu(con->in_reply.features);
+
+ dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+ switch (con->in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ fail_protocol(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ le32_to_cpu(con->out_connect.protocol_version),
+ le32_to_cpu(con->in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ fail_protocol(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->auth_retry);
+ if (con->auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ reset_connection(con);
+ set_bit(CLOSED, &con->state);
+ return -1;
+ }
+ con->auth_retry = 1;
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->in_connect.connect_seq));
+ pr_err("%s%lld %s connection reset\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr));
+ reset_connection(con);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ mutex_unlock(&con->mutex);
+ pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+ le32_to_cpu(con->out_connect.connect_seq),
+ le32_to_cpu(con->in_connect.connect_seq));
+ con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_connect.global_seq));
+ get_global_seq(con->msgr,
+ le32_to_cpu(con->in_connect.global_seq));
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_READY:
+ if (req_feat & ~server_feat) {
+ pr_err("%s%lld %s protocol feature mismatch,"
+ " my required %llx > server's %llx, need %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ fail_protocol(con);
+ return -1;
+ }
+ clear_bit(CONNECTING, &con->state);
+ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+ con->connect_seq++;
+ con->peer_features = server_feat;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_reply.connect_seq),
+ con->connect_seq);
+ WARN_ON(con->connect_seq !=
+ le32_to_cpu(con->in_reply.connect_seq));
+
+ if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+ set_bit(LOSSYTX, &con->state);
+
+ prepare_read_tag(con);
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ pr_err("process_connect peer connecting WAIT\n");
+
+ default:
+ pr_err("connect protocol error, will retry\n");
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }
+ return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int to = 0;
+
+ return read_partial(con, &to, sizeof(con->in_temp_ack),
+ &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u64 ack = le64_to_cpu(con->in_temp_ack);
+ u64 seq;
+
+ while (!list_empty(&con->out_sent)) {
+ m = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ seq = le64_to_cpu(m->hdr.seq);
+ if (seq > ack)
+ break;
+ dout("got ack for seq %llu type %d at %p\n", seq,
+ le16_to_cpu(m->hdr.type), m);
+ ceph_msg_remove(m);
+ }
+ prepare_read_tag(con);
+}
+
+
+
+
+static int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
+{
+ int ret, left;
+
+ BUG_ON(!section);
+
+ while (section->iov_len < sec_len) {
+ BUG_ON(section->iov_base == NULL);
+ left = sec_len - section->iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+ section->iov_len, left);
+ if (ret <= 0)
+ return ret;
+ section->iov_len += ret;
+ if (section->iov_len == sec_len)
+ *crc = crc32c(0, section->iov_base,
+ section->iov_len);
+ }
+
+ return 1;
+}
+
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip);
+
+
+static int read_partial_message_pages(struct ceph_connection *con,
+ struct page **pages,
+ unsigned data_len, int datacrc)
+{
+ void *p;
+ int ret;
+ int left;
+
+ left = min((int)(data_len - con->in_msg_pos.data_pos),
+ (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+ /* (page) data */
+ BUG_ON(pages == NULL);
+ p = kmap(pages[con->in_msg_pos.page]);
+ ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+ left);
+ if (ret > 0 && datacrc)
+ con->in_data_crc =
+ crc32c(con->in_data_crc,
+ p + con->in_msg_pos.page_pos, ret);
+ kunmap(pages[con->in_msg_pos.page]);
+ if (ret <= 0)
+ return ret;
+ con->in_msg_pos.data_pos += ret;
+ con->in_msg_pos.page_pos += ret;
+ if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+ con->in_msg_pos.page_pos = 0;
+ con->in_msg_pos.page++;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_BLOCK
+static int read_partial_message_bio(struct ceph_connection *con,
+ struct bio **bio_iter, int *bio_seg,
+ unsigned data_len, int datacrc)
+{
+ struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+ void *p;
+ int ret, left;
+
+ if (IS_ERR(bv))
+ return PTR_ERR(bv);
+
+ left = min((int)(data_len - con->in_msg_pos.data_pos),
+ (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+ p = kmap(bv->bv_page) + bv->bv_offset;
+
+ ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+ left);
+ if (ret > 0 && datacrc)
+ con->in_data_crc =
+ crc32c(con->in_data_crc,
+ p + con->in_msg_pos.page_pos, ret);
+ kunmap(bv->bv_page);
+ if (ret <= 0)
+ return ret;
+ con->in_msg_pos.data_pos += ret;
+ con->in_msg_pos.page_pos += ret;
+ if (con->in_msg_pos.page_pos == bv->bv_len) {
+ con->in_msg_pos.page_pos = 0;
+ iter_bio_next(bio_iter, bio_seg);
+ }
+
+ return ret;
+}
+#endif
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ int ret;
+ int to, left;
+ unsigned front_len, middle_len, data_len, data_off;
+ int datacrc = con->msgr->nocrc;
+ int skip;
+ u64 seq;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ while (con->in_base_pos < sizeof(con->in_hdr)) {
+ left = sizeof(con->in_hdr) - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock,
+ (char *)&con->in_hdr + con->in_base_pos,
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ if (con->in_base_pos == sizeof(con->in_hdr)) {
+ u32 crc = crc32c(0, (void *)&con->in_hdr,
+ sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+ if (crc != le32_to_cpu(con->in_hdr.crc)) {
+ pr_err("read_partial_message bad hdr "
+ " crc %u != expected %u\n",
+ crc, con->in_hdr.crc);
+ return -EBADMSG;
+ }
+ }
+ }
+ front_len = le32_to_cpu(con->in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+ data_off = le16_to_cpu(con->in_hdr.data_off);
+
+ /* verify seq# */
+ seq = le64_to_cpu(con->in_hdr.seq);
+ if ((s64)seq - (s64)con->in_seq < 1) {
+ pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ seq, con->in_seq + 1);
+ con->in_base_pos = -front_len - middle_len - data_len -
+ sizeof(m->footer);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ return 0;
+ } else if ((s64)seq - (s64)con->in_seq > 1) {
+ pr_err("read_partial_message bad seq %lld expected %lld\n",
+ seq, con->in_seq + 1);
+ con->error_msg = "bad message sequence # for incoming message";
+ return -EBADMSG;
+ }
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+ con->in_hdr.front_len, con->in_hdr.data_len);
+ skip = 0;
+ con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+ if (skip) {
+ /* skip this message */
+ dout("alloc_msg said skip message\n");
+ BUG_ON(con->in_msg);
+ con->in_base_pos = -front_len - middle_len - data_len -
+ sizeof(m->footer);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ return 0;
+ }
+ if (!con->in_msg) {
+ con->error_msg =
+ "error allocating memory for incoming message";
+ return -ENOMEM;
+ }
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ if (m->middle)
+ m->middle->vec.iov_len = 0;
+
+ con->in_msg_pos.page = 0;
+ if (m->pages)
+ con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+ else
+ con->in_msg_pos.page_pos = 0;
+ con->in_msg_pos.data_pos = 0;
+ }
+
+ /* front */
+ ret = read_partial_message_section(con, &m->front, front_len,
+ &con->in_front_crc);
+ if (ret <= 0)
+ return ret;
+
+ /* middle */
+ if (m->middle) {
+ ret = read_partial_message_section(con, &m->middle->vec,
+ middle_len,
+ &con->in_middle_crc);
+ if (ret <= 0)
+ return ret;
+ }
+#ifdef CONFIG_BLOCK
+ if (m->bio && !m->bio_iter)
+ init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
+
+ /* (page) data */
+ while (con->in_msg_pos.data_pos < data_len) {
+ if (m->pages) {
+ ret = read_partial_message_pages(con, m->pages,
+ data_len, datacrc);
+ if (ret <= 0)
+ return ret;
+#ifdef CONFIG_BLOCK
+ } else if (m->bio) {
+
+ ret = read_partial_message_bio(con,
+ &m->bio_iter, &m->bio_seg,
+ data_len, datacrc);
+ if (ret <= 0)
+ return ret;
+#endif
+ } else {
+ BUG_ON(1);
+ }
+ }
+
+ /* footer */
+ to = sizeof(m->hdr) + sizeof(m->footer);
+ while (con->in_base_pos < to) {
+ left = to - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+ (con->in_base_pos - sizeof(m->hdr)),
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+/*
+ * Process message. This happens in the worker thread. The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+
+ msg = con->in_msg;
+ con->in_msg = NULL;
+
+ /* if first message, set peer_name */
+ if (con->peer_name.type == 0)
+ con->peer_name = msg->hdr.src;
+
+ con->in_seq++;
+ mutex_unlock(&con->mutex);
+
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ msg, le64_to_cpu(msg->hdr.seq),
+ ENTITY_NAME(msg->hdr.src),
+ le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.data_len),
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+ con->ops->dispatch(con, msg);
+
+ mutex_lock(&con->mutex);
+ prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+ struct ceph_messenger *msgr = con->msgr;
+ int ret = 1;
+
+ dout("try_write start %p state %lu nref %d\n", con, con->state,
+ atomic_read(&con->nref));
+
+more:
+ dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+ /* open the socket first? */
+ if (con->sock == NULL) {
+ /*
+ * if we were STANDBY and are reconnecting _this_
+ * connection, bump connect_seq now. Always bump
+ * global_seq.
+ */
+ if (test_and_clear_bit(STANDBY, &con->state))
+ con->connect_seq++;
+
+ prepare_write_banner(msgr, con);
+ prepare_write_connect(msgr, con, 1);
+ prepare_read_banner(con);
+ set_bit(CONNECTING, &con->state);
+ clear_bit(NEGOTIATING, &con->state);
+
+ BUG_ON(con->in_msg);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %lu\n",
+ con, con->state);
+ con->sock = ceph_tcp_connect(con);
+ if (IS_ERR(con->sock)) {
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ ret = -1;
+ goto out;
+ }
+ }
+
+more_kvec:
+ /* kvec data queued? */
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_skip err %d\n", ret);
+ goto done;
+ }
+ }
+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto done;
+ }
+
+ /* msg pages? */
+ if (con->out_msg) {
+ if (con->out_msg_done) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL; /* we're done with this one */
+ goto do_next;
+ }
+
+ ret = write_partial_msg_pages(con);
+ if (ret == 1)
+ goto more_kvec; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_msg_pages err %d\n",
+ ret);
+ goto done;
+ }
+ }
+
+do_next:
+ if (!test_bit(CONNECTING, &con->state)) {
+ /* is anything else pending? */
+ if (!list_empty(&con->out_queue)) {
+ prepare_write_message(con);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ clear_bit(WRITE_PENDING, &con->state);
+ dout("try_write nothing else to write.\n");
+done:
+ ret = 0;
+out:
+ dout("try_write done on %p\n", con);
+ return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+ int ret = -1;
+
+ if (!con->sock)
+ return 0;
+
+ if (test_bit(STANDBY, &con->state))
+ return 0;
+
+ dout("try_read start on %p\n", con);
+
+more:
+ dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+ con->in_base_pos);
+ if (test_bit(CONNECTING, &con->state)) {
+ if (!test_bit(NEGOTIATING, &con->state)) {
+ dout("try_read connecting\n");
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto done;
+ if (process_banner(con) < 0) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto done;
+ if (process_connect(con) < 0) {
+ ret = -1;
+ goto out;
+ }
+ goto more;
+ }
+
+ if (con->in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ *
+ * FIXME: there must be a better way to do this!
+ */
+ static char buf[1024];
+ int skip = min(1024, -con->in_base_pos);
+ dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+ ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+ if (ret <= 0)
+ goto done;
+ con->in_base_pos += ret;
+ if (con->in_base_pos)
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+ if (ret <= 0)
+ goto done;
+ dout("try_read got tag %d\n", (int)con->in_tag);
+ switch (con->in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ set_bit(CLOSED, &con->state); /* fixme */
+ goto done;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc";
+ ret = -EIO;
+ goto out;
+ case -EIO:
+ con->error_msg = "io error";
+ goto out;
+ default:
+ goto done;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ process_message(con);
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto done;
+ process_ack(con);
+ goto more;
+ }
+
+done:
+ ret = 0;
+out:
+ dout("try_read done on %p\n", con);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;
+ goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection. Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY. It
+ * clears QUEUED and does it's thing. When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work. If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+ if (test_bit(DEAD, &con->state)) {
+ dout("queue_con %p ignoring: DEAD\n",
+ con);
+ return;
+ }
+
+ if (!con->ops->get(con)) {
+ dout("queue_con %p ref count 0\n", con);
+ return;
+ }
+
+ set_bit(QUEUED, &con->state);
+ if (test_bit(BUSY, &con->state)) {
+ dout("queue_con %p - already BUSY\n", con);
+ con->ops->put(con);
+ } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+ dout("queue_con %p - already queued\n", con);
+ con->ops->put(con);
+ } else {
+ dout("queue_con %p\n", con);
+ }
+}
+
+/*
+ * Do some work on a connection. Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+ struct ceph_connection *con = container_of(work, struct ceph_connection,
+ work.work);
+ int backoff = 0;
+
+more:
+ if (test_and_set_bit(BUSY, &con->state) != 0) {
+ dout("con_work %p BUSY already set\n", con);
+ goto out;
+ }
+ dout("con_work %p start, clearing QUEUED\n", con);
+ clear_bit(QUEUED, &con->state);
+
+ mutex_lock(&con->mutex);
+
+ if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+ dout("con_work CLOSED\n");
+ con_close_socket(con);
+ goto done;
+ }
+ if (test_and_clear_bit(OPENING, &con->state)) {
+ /* reopen w/ new peer */
+ dout("con_work OPENING\n");
+ con_close_socket(con);
+ }
+
+ if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+ try_read(con) < 0 ||
+ try_write(con) < 0) {
+ mutex_unlock(&con->mutex);
+ backoff = 1;
+ ceph_fault(con); /* error/fault path */
+ goto done_unlocked;
+ }
+
+done:
+ mutex_unlock(&con->mutex);
+
+done_unlocked:
+ clear_bit(BUSY, &con->state);
+ dout("con->state=%lu\n", con->state);
+ if (test_bit(QUEUED, &con->state)) {
+ if (!backoff || test_bit(OPENING, &con->state)) {
+ dout("con_work %p QUEUED reset, looping\n", con);
+ goto more;
+ }
+ dout("con_work %p QUEUED reset, but just faulted\n", con);
+ clear_bit(QUEUED, &con->state);
+ }
+ dout("con_work %p done\n", con);
+
+out:
+ con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler. A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+ pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ dout("fault %p state %lu to peer %s\n",
+ con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+ if (test_bit(LOSSYTX, &con->state)) {
+ dout("fault on LOSSYTX channel\n");
+ goto out;
+ }
+
+ mutex_lock(&con->mutex);
+ if (test_bit(CLOSED, &con->state))
+ goto out_unlock;
+
+ con_close_socket(con);
+
+ if (con->in_msg) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+
+ /* Requeue anything that hasn't been acked */
+ list_splice_init(&con->out_sent, &con->out_queue);
+
+ /* If there are no messages in the queue, place the connection
+ * in a STANDBY state (i.e., don't try to reconnect just yet). */
+ if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+ dout("fault setting STANDBY\n");
+ set_bit(STANDBY, &con->state);
+ } else {
+ /* retry after a delay. */
+ if (con->delay == 0)
+ con->delay = BASE_DELAY_INTERVAL;
+ else if (con->delay < MAX_DELAY_INTERVAL)
+ con->delay *= 2;
+ dout("fault queueing %p delay %lu\n", con, con->delay);
+ con->ops->get(con);
+ if (queue_delayed_work(ceph_msgr_wq, &con->work,
+ round_jiffies_relative(con->delay)) == 0)
+ con->ops->put(con);
+ }
+
+out_unlock:
+ mutex_unlock(&con->mutex);
+out:
+ /*
+ * in case we faulted due to authentication, invalidate our
+ * current tickets so that we can get new ones.
+ */
+ if (con->auth_retry && con->ops->invalidate_authorizer) {
+ dout("calling invalidate_authorizer()\n");
+ con->ops->invalidate_authorizer(con);
+ }
+
+ if (con->ops->fault)
+ con->ops->fault(con);
+}
+
+
+
+/*
+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
+ u32 supported_features,
+ u32 required_features)
+{
+ struct ceph_messenger *msgr;
+
+ msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+ if (msgr == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ msgr->supported_features = supported_features;
+ msgr->required_features = required_features;
+
+ spin_lock_init(&msgr->global_seq_lock);
+
+ /* the zero page is needed if a request is "canceled" while the message
+ * is being written over the socket */
+ msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
+ if (!msgr->zero_page) {
+ kfree(msgr);
+ return ERR_PTR(-ENOMEM);
+ }
+ kmap(msgr->zero_page);
+
+ if (myaddr)
+ msgr->inst.addr = *myaddr;
+
+ /* select a random nonce */
+ msgr->inst.addr.type = 0;
+ get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+ encode_my_addr(msgr);
+
+ dout("messenger_create %p\n", msgr);
+ return msgr;
+}
+EXPORT_SYMBOL(ceph_messenger_create);
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+ dout("destroy %p\n", msgr);
+ kunmap(msgr->zero_page);
+ __free_page(msgr->zero_page);
+ kfree(msgr);
+ dout("destroyed messenger %p\n", msgr);
+}
+EXPORT_SYMBOL(ceph_messenger_destroy);
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ if (test_bit(CLOSED, &con->state)) {
+ dout("con_send %p closed, dropping %p\n", con, msg);
+ ceph_msg_put(msg);
+ return;
+ }
+
+ /* set src+dst */
+ msg->hdr.src = con->msgr->inst.name;
+
+ BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+ msg->needs_out_seq = true;
+
+ /* queue */
+ mutex_lock(&con->mutex);
+ BUG_ON(!list_empty(&msg->list_head));
+ list_add_tail(&msg->list_head, &con->out_queue);
+ dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+ ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
+ le32_to_cpu(msg->hdr.data_len));
+ mutex_unlock(&con->mutex);
+
+ /* if there wasn't anything waiting to send before, queue
+ * new work */
+ if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ mutex_lock(&con->mutex);
+ if (!list_empty(&msg->list_head)) {
+ dout("con_revoke %p msg %p - was on queue\n", con, msg);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ msg->hdr.seq = 0;
+ }
+ if (con->out_msg == msg) {
+ dout("con_revoke %p msg %p - was sending\n", con, msg);
+ con->out_msg = NULL;
+ if (con->out_kvec_is_msg) {
+ con->out_skip = con->out_kvec_bytes;
+ con->out_kvec_is_msg = false;
+ }
+ ceph_msg_put(msg);
+ msg->hdr.seq = 0;
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ mutex_lock(&con->mutex);
+ if (con->in_msg && con->in_msg == msg) {
+ unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
+ unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
+
+ /* skip rest of message */
+ dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+ con->in_base_pos = con->in_base_pos -
+ sizeof(struct ceph_msg_header) -
+ front_len -
+ middle_len -
+ data_len -
+ sizeof(struct ceph_msg_footer);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ con->in_seq++;
+ } else {
+ dout("con_revoke_pages %p msg %p pages %p no-op\n",
+ con, con->in_msg, msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+ if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+ test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_keepalive);
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
+{
+ struct ceph_msg *m;
+
+ m = kmalloc(sizeof(*m), flags);
+ if (m == NULL)
+ goto out;
+ kref_init(&m->kref);
+ INIT_LIST_HEAD(&m->list_head);
+
+ m->hdr.tid = 0;
+ m->hdr.type = cpu_to_le16(type);
+ m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+ m->hdr.version = 0;
+ m->hdr.front_len = cpu_to_le32(front_len);
+ m->hdr.middle_len = 0;
+ m->hdr.data_len = 0;
+ m->hdr.data_off = 0;
+ m->hdr.reserved = 0;
+ m->footer.front_crc = 0;
+ m->footer.middle_crc = 0;
+ m->footer.data_crc = 0;
+ m->footer.flags = 0;
+ m->front_max = front_len;
+ m->front_is_vmalloc = false;
+ m->more_to_follow = false;
+ m->pool = NULL;
+
+ /* front */
+ if (front_len) {
+ if (front_len > PAGE_CACHE_SIZE) {
+ m->front.iov_base = __vmalloc(front_len, flags,
+ PAGE_KERNEL);
+ m->front_is_vmalloc = true;
+ } else {
+ m->front.iov_base = kmalloc(front_len, flags);
+ }
+ if (m->front.iov_base == NULL) {
+ pr_err("msg_new can't allocate %d bytes\n",
+ front_len);
+ goto out2;
+ }
+ } else {
+ m->front.iov_base = NULL;
+ }
+ m->front.iov_len = front_len;
+
+ /* middle */
+ m->middle = NULL;
+
+ /* data */
+ m->nr_pages = 0;
+ m->pages = NULL;
+ m->pagelist = NULL;
+ m->bio = NULL;
+ m->bio_iter = NULL;
+ m->bio_seg = 0;
+ m->trail = NULL;
+
+ dout("ceph_msg_new %p front %d\n", m, front_len);
+ return m;
+
+out2:
+ ceph_msg_put(m);
+out:
+ pr_err("msg_new can't create type %d front %d\n", type, front_len);
+ return NULL;
+}
+EXPORT_SYMBOL(ceph_msg_new);
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg. This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ int type = le16_to_cpu(msg->hdr.type);
+ int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+ dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+ ceph_msg_type_name(type), middle_len);
+ BUG_ON(!middle_len);
+ BUG_ON(msg->middle);
+
+ msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+ if (!msg->middle)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);
+ int middle_len = le32_to_cpu(hdr->middle_len);
+ struct ceph_msg *msg = NULL;
+ int ret;
+
+ if (con->ops->alloc_msg) {
+ mutex_unlock(&con->mutex);
+ msg = con->ops->alloc_msg(con, hdr, skip);
+ mutex_lock(&con->mutex);
+ if (!msg || *skip)
+ return NULL;
+ }
+ if (!msg) {
+ *skip = 0;
+ msg = ceph_msg_new(type, front_len, GFP_NOFS);
+ if (!msg) {
+ pr_err("unable to allocate msg type %d len %d\n",
+ type, front_len);
+ return NULL;
+ }
+ }
+ memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+ if (middle_len && !msg->middle) {
+ ret = ceph_alloc_middle(con, msg);
+ if (ret < 0) {
+ ceph_msg_put(msg);
+ return NULL;
+ }
+ }
+
+ return msg;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+ dout("msg_kfree %p\n", m);
+ if (m->front_is_vmalloc)
+ vfree(m->front.iov_base);
+ else
+ kfree(m->front.iov_base);
+ kfree(m);
+}
+
+/*
+ * Drop a msg ref. Destroy as needed.
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+ struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+ dout("ceph_msg_put last one on %p\n", m);
+ WARN_ON(!list_empty(&m->list_head));
+
+ /* drop middle, data, if any */
+ if (m->middle) {
+ ceph_buffer_put(m->middle);
+ m->middle = NULL;
+ }
+ m->nr_pages = 0;
+ m->pages = NULL;
+
+ if (m->pagelist) {
+ ceph_pagelist_release(m->pagelist);
+ kfree(m->pagelist);
+ m->pagelist = NULL;
+ }
+
+ m->trail = NULL;
+
+ if (m->pool)
+ ceph_msgpool_put(m->pool, m);
+ else
+ ceph_msg_kfree(m);
+}
+EXPORT_SYMBOL(ceph_msg_last_put);
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+ pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+ msg->front_max, msg->nr_pages);
+ print_hex_dump(KERN_DEBUG, "header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->hdr, sizeof(msg->hdr), true);
+ print_hex_dump(KERN_DEBUG, " front: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->front.iov_base, msg->front.iov_len, true);
+ if (msg->middle)
+ print_hex_dump(KERN_DEBUG, "middle: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->middle->vec.iov_base,
+ msg->middle->vec.iov_len, true);
+ print_hex_dump(KERN_DEBUG, "footer: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 00000000000..8a079399174
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/ceph/auth.h>
+
+/*
+ * Interact with Ceph monitor cluster. Handle requests for new map
+ * versions, and periodically resend as needed. Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information. An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates. We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure. If the connection does break, we
+ * randomly hunt for a new monitor. Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+ struct ceph_monmap *m = NULL;
+ int i, err = -EINVAL;
+ struct ceph_fsid fsid;
+ u32 epoch, num_mon;
+ u16 version;
+ u32 len;
+
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+
+ dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+ ceph_decode_16_safe(&p, end, version, bad);
+
+ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(&p);
+
+ num_mon = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+ if (num_mon >= CEPH_MAX_MON)
+ goto bad;
+ m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+ if (m == NULL)
+ return ERR_PTR(-ENOMEM);
+ m->fsid = fsid;
+ m->epoch = epoch;
+ m->num_mon = num_mon;
+ ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+ for (i = 0; i < num_mon; i++)
+ ceph_decode_addr(&m->mon_inst[i].addr);
+
+ dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+ m->num_mon);
+ for (i = 0; i < m->num_mon; i++)
+ dout("monmap_decode mon%d is %s\n", i,
+ ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+ return m;
+
+bad:
+ dout("monmap_decode failed with %d\n", err);
+ kfree(m);
+ return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+ int i;
+
+ for (i = 0; i < m->num_mon; i++)
+ if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+ monc->pending_auth = 1;
+ monc->m_auth->front.iov_len = len;
+ monc->m_auth->hdr.front_len = cpu_to_le32(len);
+ ceph_con_revoke(monc->con, monc->m_auth);
+ ceph_msg_get(monc->m_auth); /* keep our ref */
+ ceph_con_send(monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+ if (monc->con) {
+ dout("__close_session closing mon%d\n", monc->cur_mon);
+ ceph_con_revoke(monc->con, monc->m_auth);
+ ceph_con_close(monc->con);
+ monc->cur_mon = -1;
+ monc->pending_auth = 0;
+ ceph_auth_reset(monc->auth);
+ }
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+ char r;
+ int ret;
+
+ if (monc->cur_mon < 0) {
+ get_random_bytes(&r, 1);
+ monc->cur_mon = r % monc->monmap->num_mon;
+ dout("open_session num=%d r=%d -> mon%d\n",
+ monc->monmap->num_mon, r, monc->cur_mon);
+ monc->sub_sent = 0;
+ monc->sub_renew_after = jiffies; /* i.e., expired */
+ monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+ dout("open_session mon%d opening\n", monc->cur_mon);
+ monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+ monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+ ceph_con_open(monc->con,
+ &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+ /* initiatiate authentication handshake */
+ ret = ceph_auth_build_hello(monc->auth,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ dout("open_session mon%d already open\n", monc->cur_mon);
+ }
+ return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+ return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+ unsigned delay;
+
+ if (monc->cur_mon < 0 || __sub_expired(monc))
+ delay = 10 * HZ;
+ else
+ delay = 20 * HZ;
+ dout("__schedule_delayed after %u\n", delay);
+ schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+ dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+ (unsigned)monc->sub_sent, __sub_expired(monc),
+ monc->want_next_osdmap);
+ if ((__sub_expired(monc) && !monc->sub_sent) ||
+ monc->want_next_osdmap == 1) {
+ struct ceph_msg *msg = monc->m_subscribe;
+ struct ceph_mon_subscribe_item *i;
+ void *p, *end;
+ int num;
+
+ p = msg->front.iov_base;
+ end = p + msg->front_max;
+
+ num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+ ceph_encode_32(&p, num);
+
+ if (monc->want_next_osdmap) {
+ dout("__send_subscribe to 'osdmap' %u\n",
+ (unsigned)monc->have_osdmap);
+ ceph_encode_string(&p, end, "osdmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_osdmap);
+ i->onetime = 1;
+ p += sizeof(*i);
+ monc->want_next_osdmap = 2; /* requested */
+ }
+ if (monc->want_mdsmap) {
+ dout("__send_subscribe to 'mdsmap' %u+\n",
+ (unsigned)monc->have_mdsmap);
+ ceph_encode_string(&p, end, "mdsmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_mdsmap);
+ i->onetime = 0;
+ p += sizeof(*i);
+ }
+ ceph_encode_string(&p, end, "monmap", 6);
+ i = p;
+ i->have = 0;
+ i->onetime = 0;
+ p += sizeof(*i);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_con_revoke(monc->con, msg);
+ ceph_con_send(monc->con, ceph_msg_get(msg));
+
+ monc->sub_sent = jiffies | 1; /* never 0 */
+ }
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ unsigned seconds;
+ struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ seconds = le32_to_cpu(h->duration);
+
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ pr_info("mon%d %s session established\n",
+ monc->cur_mon,
+ ceph_pr_addr(&monc->con->peer_addr.in_addr));
+ monc->hunting = false;
+ }
+ dout("handle_subscribe_ack after %d seconds\n", seconds);
+ monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+ monc->sub_sent = 0;
+ mutex_unlock(&monc->mutex);
+ return;
+bad:
+ pr_err("got corrupt subscribe-ack msg\n");
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_mdsmap = got;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_osdmap = got;
+ monc->want_next_osdmap = 0;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+ dout("request_next_osdmap have %u\n", monc->have_osdmap);
+ mutex_lock(&monc->mutex);
+ if (!monc->want_next_osdmap)
+ monc->want_next_osdmap = 1;
+ if (monc->want_next_osdmap < 2)
+ __send_subscribe(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ *
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+ if (!monc->con) {
+ monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+ if (!monc->con)
+ return -ENOMEM;
+ ceph_con_init(monc->client->msgr, monc->con);
+ monc->con->private = monc;
+ monc->con->ops = &mon_con_ops;
+ }
+
+ mutex_lock(&monc->mutex);
+ __open_session(monc);
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_monc_open_session);
+
+/*
+ * The monitor responds with mount ack indicate mount success. The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_client *client = monc->client;
+ struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+ void *p, *end;
+
+ mutex_lock(&monc->mutex);
+
+ dout("handle_monmap\n");
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ monmap = ceph_monmap_decode(p, end);
+ if (IS_ERR(monmap)) {
+ pr_err("problem decoding monmap, %d\n",
+ (int)PTR_ERR(monmap));
+ goto out;
+ }
+
+ if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ kfree(monmap);
+ goto out;
+ }
+
+ client->monc.monmap = monmap;
+ kfree(old);
+
+out:
+ mutex_unlock(&monc->mutex);
+ wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (e.g., statfs, poolop)
+ */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+ struct ceph_mon_client *monc, u64 tid)
+{
+ struct ceph_mon_generic_request *req;
+ struct rb_node *n = monc->generic_request_tree.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_mon_generic_request, node);
+ if (tid < req->tid)
+ n = n->rb_left;
+ else if (tid > req->tid)
+ n = n->rb_right;
+ else
+ return req;
+ }
+ return NULL;
+}
+
+static void __insert_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *new)
+{
+ struct rb_node **p = &monc->generic_request_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_mon_generic_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_mon_generic_request, node);
+ if (new->tid < req->tid)
+ p = &(*p)->rb_left;
+ else if (new->tid > req->tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+static void release_generic_request(struct kref *kref)
+{
+ struct ceph_mon_generic_request *req =
+ container_of(kref, struct ceph_mon_generic_request, kref);
+
+ if (req->reply)
+ ceph_msg_put(req->reply);
+ if (req->request)
+ ceph_msg_put(req->request);
+
+ kfree(req);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+ kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+ kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ struct ceph_mon_generic_request *req;
+ u64 tid = le64_to_cpu(hdr->tid);
+ struct ceph_msg *m;
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_generic_req(monc, tid);
+ if (!req) {
+ dout("get_generic_reply %lld dne\n", tid);
+ *skip = 1;
+ m = NULL;
+ } else {
+ dout("get_generic_reply %lld got %p\n", tid, req->reply);
+ m = ceph_msg_get(req->reply);
+ /*
+ * we don't need to track the connection reading into
+ * this reply because we only have one open connection
+ * at a time, ever.
+ */
+ }
+ mutex_unlock(&monc->mutex);
+ return m;
+}
+
+static int do_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *req)
+{
+ int err;
+
+ /* register request */
+ mutex_lock(&monc->mutex);
+ req->tid = ++monc->last_tid;
+ req->request->hdr.tid = cpu_to_le64(req->tid);
+ __insert_generic_request(monc, req);
+ monc->num_generic_requests++;
+ ceph_con_send(monc->con, ceph_msg_get(req->request));
+ mutex_unlock(&monc->mutex);
+
+ err = wait_for_completion_interruptible(&req->completion);
+
+ mutex_lock(&monc->mutex);
+ rb_erase(&req->node, &monc->generic_request_tree);
+ monc->num_generic_requests--;
+ mutex_unlock(&monc->mutex);
+
+ if (!err)
+ err = req->result;
+ return err;
+}
+
+/*
+ * statfs
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+
+ if (msg->front.iov_len != sizeof(*reply))
+ goto bad;
+ dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_generic_req(monc, tid);
+ if (req) {
+ *(struct ceph_statfs *)req->buf = reply->st;
+ req->result = 0;
+ get_generic_request(req);
+ }
+ mutex_unlock(&monc->mutex);
+ if (req) {
+ complete_all(&req->completion);
+ put_generic_request(req);
+ }
+ return;
+
+bad:
+ pr_err("corrupt generic reply, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_statfs *h;
+ int err;
+
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ if (!req)
+ return -ENOMEM;
+
+ kref_init(&req->kref);
+ req->buf = buf;
+ req->buf_len = sizeof(*buf);
+ init_completion(&req->completion);
+
+ err = -ENOMEM;
+ req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+ if (!req->request)
+ goto out;
+ req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+ if (!req->reply)
+ goto out;
+
+ /* fill out request */
+ h = req->request->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+
+ err = do_generic_request(monc, req);
+
+out:
+ kref_put(&req->kref, release_generic_request);
+ return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_statfs);
+
+/*
+ * pool ops
+ */
+static int get_poolop_reply_buf(const char *src, size_t src_len,
+ char *dst, size_t dst_len)
+{
+ u32 buf_len;
+
+ if (src_len != sizeof(u32) + dst_len)
+ return -EINVAL;
+
+ buf_len = le32_to_cpu(*(u32 *)src);
+ if (buf_len != dst_len)
+ return -EINVAL;
+
+ memcpy(dst, src + sizeof(u32), dst_len);
+ return 0;
+}
+
+static void handle_poolop_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+
+ if (msg->front.iov_len < sizeof(*reply))
+ goto bad;
+ dout("handle_poolop_reply %p tid %llu\n", msg, tid);
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_generic_req(monc, tid);
+ if (req) {
+ if (req->buf_len &&
+ get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
+ msg->front.iov_len - sizeof(*reply),
+ req->buf, req->buf_len) < 0) {
+ mutex_unlock(&monc->mutex);
+ goto bad;
+ }
+ req->result = le32_to_cpu(reply->reply_code);
+ get_generic_request(req);
+ }
+ mutex_unlock(&monc->mutex);
+ if (req) {
+ complete(&req->completion);
+ put_generic_request(req);
+ }
+ return;
+
+bad:
+ pr_err("corrupt generic reply, tid %llu\n", tid);
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous pool op.
+ */
+int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+ u32 pool, u64 snapid,
+ char *buf, int len)
+{
+ struct ceph_mon_generic_request *req;
+ struct ceph_mon_poolop *h;
+ int err;
+
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ if (!req)
+ return -ENOMEM;
+
+ kref_init(&req->kref);
+ req->buf = buf;
+ req->buf_len = len;
+ init_completion(&req->completion);
+
+ err = -ENOMEM;
+ req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+ if (!req->request)
+ goto out;
+ req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+ if (!req->reply)
+ goto out;
+
+ /* fill out request */
+ req->request->hdr.version = cpu_to_le16(2);
+ h = req->request->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+ h->pool = cpu_to_le32(pool);
+ h->op = cpu_to_le32(op);
+ h->auid = 0;
+ h->snapid = cpu_to_le64(snapid);
+ h->name_len = 0;
+
+ err = do_generic_request(monc, req);
+
+out:
+ kref_put(&req->kref, release_generic_request);
+ return err;
+}
+
+int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+ u32 pool, u64 *snapid)
+{
+ return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
+ pool, 0, (char *)snapid, sizeof(*snapid));
+
+}
+EXPORT_SYMBOL(ceph_monc_create_snapid);
+
+int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+ u32 pool, u64 snapid)
+{
+ return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
+ pool, snapid, 0, 0);
+
+}
+
+/*
+ * Resend pending generic requests.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+ struct ceph_mon_generic_request *req;
+ struct rb_node *p;
+
+ for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_mon_generic_request, node);
+ ceph_con_revoke(monc->con, req->request);
+ ceph_con_send(monc->con, ceph_msg_get(req->request));
+ }
+}
+
+/*
+ * Delayed work. If we haven't mounted yet, retry. Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM). And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+ struct ceph_mon_client *monc =
+ container_of(work, struct ceph_mon_client, delayed_work.work);
+
+ dout("monc delayed_work\n");
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ __close_session(monc);
+ __open_session(monc); /* continue hunting */
+ } else {
+ ceph_con_keepalive(monc->con);
+
+ __validate_auth(monc);
+
+ if (monc->auth->ops->is_authenticated(monc->auth))
+ __send_subscribe(monc);
+ }
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+ struct ceph_options *opt = monc->client->options;
+ struct ceph_entity_addr *mon_addr = opt->mon_addr;
+ int num_mon = opt->num_mon;
+ int i;
+
+ /* build initial monmap */
+ monc->monmap = kzalloc(sizeof(*monc->monmap) +
+ num_mon*sizeof(monc->monmap->mon_inst[0]),
+ GFP_KERNEL);
+ if (!monc->monmap)
+ return -ENOMEM;
+ for (i = 0; i < num_mon; i++) {
+ monc->monmap->mon_inst[i].addr = mon_addr[i];
+ monc->monmap->mon_inst[i].addr.nonce = 0;
+ monc->monmap->mon_inst[i].name.type =
+ CEPH_ENTITY_TYPE_MON;
+ monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ }
+ monc->monmap->num_mon = num_mon;
+ monc->have_fsid = false;
+ return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+ int err = 0;
+
+ dout("init\n");
+ memset(monc, 0, sizeof(*monc));
+ monc->client = cl;
+ monc->monmap = NULL;
+ mutex_init(&monc->mutex);
+
+ err = build_initial_monmap(monc);
+ if (err)
+ goto out;
+
+ monc->con = NULL;
+
+ /* authentication */
+ monc->auth = ceph_auth_init(cl->options->name,
+ cl->options->secret);
+ if (IS_ERR(monc->auth))
+ return PTR_ERR(monc->auth);
+ monc->auth->want_keys =
+ CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+ /* msgs */
+ err = -ENOMEM;
+ monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+ sizeof(struct ceph_mon_subscribe_ack),
+ GFP_NOFS);
+ if (!monc->m_subscribe_ack)
+ goto out_monmap;
+
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+ if (!monc->m_subscribe)
+ goto out_subscribe_ack;
+
+ monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+ if (!monc->m_auth_reply)
+ goto out_subscribe;
+
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
+ monc->pending_auth = 0;
+ if (!monc->m_auth)
+ goto out_auth_reply;
+
+ monc->cur_mon = -1;
+ monc->hunting = true;
+ monc->sub_renew_after = jiffies;
+ monc->sub_sent = 0;
+
+ INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+ monc->generic_request_tree = RB_ROOT;
+ monc->num_generic_requests = 0;
+ monc->last_tid = 0;
+
+ monc->have_mdsmap = 0;
+ monc->have_osdmap = 0;
+ monc->want_next_osdmap = 1;
+ return 0;
+
+out_auth_reply:
+ ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+ ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+ ceph_msg_put(monc->m_subscribe_ack);
+out_monmap:
+ kfree(monc->monmap);
+out:
+ return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+ dout("stop\n");
+ cancel_delayed_work_sync(&monc->delayed_work);
+
+ mutex_lock(&monc->mutex);
+ __close_session(monc);
+ if (monc->con) {
+ monc->con->private = NULL;
+ monc->con->ops->put(monc->con);
+ monc->con = NULL;
+ }
+ mutex_unlock(&monc->mutex);
+
+ ceph_auth_destroy(monc->auth);
+
+ ceph_msg_put(monc->m_auth);
+ ceph_msg_put(monc->m_auth_reply);
+ ceph_msg_put(monc->m_subscribe);
+ ceph_msg_put(monc->m_subscribe_ack);
+
+ kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ int ret;
+ int was_auth = 0;
+
+ mutex_lock(&monc->mutex);
+ if (monc->auth->ops)
+ was_auth = monc->auth->ops->is_authenticated(monc->auth);
+ monc->pending_auth = 0;
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ if (ret < 0) {
+ monc->client->auth_err = ret;
+ wake_up_all(&monc->client->auth_wq);
+ } else if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
+ dout("authenticated, starting session\n");
+
+ monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+ monc->client->msgr->inst.name.num =
+ cpu_to_le64(monc->auth->global_id);
+
+ __send_subscribe(monc);
+ __resend_generic_request(monc);
+ }
+ mutex_unlock(&monc->mutex);
+}
+
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ if (monc->pending_auth)
+ return 0;
+
+ ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ if (ret <= 0)
+ return ret; /* either an error, or no need to authenticate */
+ __send_prepared_auth_request(monc, ret);
+ return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = __validate_auth(monc);
+ mutex_unlock(&monc->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!monc)
+ return;
+
+ switch (type) {
+ case CEPH_MSG_AUTH_REPLY:
+ handle_auth_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ handle_subscribe_ack(monc, msg);
+ break;
+
+ case CEPH_MSG_STATFS_REPLY:
+ handle_statfs_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_POOLOP_REPLY:
+ handle_poolop_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_MAP:
+ ceph_monc_handle_map(monc, msg);
+ break;
+
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(&monc->client->osdc, msg);
+ break;
+
+ default:
+ /* can the chained handler handle it? */
+ if (monc->client->extra_mon_dispatch &&
+ monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+ break;
+
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);
+ struct ceph_msg *m = NULL;
+
+ *skip = 0;
+
+ switch (type) {
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ m = ceph_msg_get(monc->m_subscribe_ack);
+ break;
+ case CEPH_MSG_POOLOP_REPLY:
+ case CEPH_MSG_STATFS_REPLY:
+ return get_generic_reply(con, hdr, skip);
+ case CEPH_MSG_AUTH_REPLY:
+ m = ceph_msg_get(monc->m_auth_reply);
+ break;
+ case CEPH_MSG_MON_MAP:
+ case CEPH_MSG_MDS_MAP:
+ case CEPH_MSG_OSD_MAP:
+ m = ceph_msg_new(type, front_len, GFP_NOFS);
+ break;
+ }
+
+ if (!m) {
+ pr_info("alloc_msg unknown type %d\n", type);
+ *skip = 1;
+ }
+ return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+ struct ceph_mon_client *monc = con->private;
+
+ if (!monc)
+ return;
+
+ dout("mon_fault\n");
+ mutex_lock(&monc->mutex);
+ if (!con->private)
+ goto out;
+
+ if (monc->con && !monc->hunting)
+ pr_info("mon%d %s session lost, "
+ "hunting for new mon\n", monc->cur_mon,
+ ceph_pr_addr(&monc->con->peer_addr.in_addr));
+
+ __close_session(monc);
+ if (!monc->hunting) {
+ /* start hunting */
+ monc->hunting = true;
+ __open_session(monc);
+ } else {
+ /* already hunting, let's wait a bit */
+ __schedule_delayed(monc);
+ }
+out:
+ mutex_unlock(&monc->mutex);
+}
+
+static const struct ceph_connection_operations mon_con_ops = {
+ .get = ceph_con_get,
+ .put = ceph_con_put,
+ .dispatch = dispatch,
+ .fault = mon_fault,
+ .alloc_msg = mon_alloc_msg,
+};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 00000000000..d5f2d97ac05
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <linux/ceph/msgpool.h>
+
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+ struct ceph_msgpool *pool = arg;
+ void *p;
+
+ p = ceph_msg_new(0, pool->front_len, gfp_mask);
+ if (!p)
+ pr_err("msgpool %s alloc failed\n", pool->name);
+ return p;
+}
+
+static void free_fn(void *element, void *arg)
+{
+ ceph_msg_put(element);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+ int front_len, int size, bool blocking, const char *name)
+{
+ pool->front_len = front_len;
+ pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+ if (!pool->pool)
+ return -ENOMEM;
+ pool->name = name;
+ return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+ mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+ int front_len)
+{
+ if (front_len > pool->front_len) {
+ pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+ pool->name, front_len, pool->front_len);
+ WARN_ON(1);
+
+ /* try to alloc a fresh message */
+ return ceph_msg_new(0, front_len, GFP_NOFS);
+ }
+
+ return mempool_alloc(pool->pool, GFP_NOFS);
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+ /* reset msg front_len; user may have changed it */
+ msg->front.iov_len = pool->front_len;
+ msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+ kref_init(&msg->kref); /* retake single ref */
+}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 00000000000..79391994b3e
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,1773 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+#define OSD_OP_FRONT_LEN 4096
+#define OSD_OPREPLY_FRONT_LEN 512
+
+static const struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd);
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+static int op_needs_trail(int op)
+{
+ switch (op) {
+ case CEPH_OSD_OP_GETXATTR:
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ case CEPH_OSD_OP_CALL:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int op_has_extent(int op)
+{
+ return (op == CEPH_OSD_OP_READ ||
+ op == CEPH_OSD_OP_WRITE);
+}
+
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+ struct ceph_file_layout *layout,
+ u64 snapid,
+ u64 off, u64 *plen, u64 *bno,
+ struct ceph_osd_request *req,
+ struct ceph_osd_req_op *op)
+{
+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ u64 orig_len = *plen;
+ u64 objoff, objlen; /* extent in object */
+
+ reqhead->snapid = cpu_to_le64(snapid);
+
+ /* object extent? */
+ ceph_calc_file_object_mapping(layout, off, plen, bno,
+ &objoff, &objlen);
+ if (*plen < orig_len)
+ dout(" skipping last %llu, final file extent %llu~%llu\n",
+ orig_len - *plen, off, *plen);
+
+ if (op_has_extent(op->op)) {
+ op->extent.offset = objoff;
+ op->extent.length = objlen;
+ }
+ req->r_num_pages = calc_pages_for(off, *plen);
+ if (op->op == CEPH_OSD_OP_WRITE)
+ op->payload_len = *plen;
+
+ dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+ *bno, objoff, objlen, req->r_num_pages);
+
+}
+EXPORT_SYMBOL(ceph_calc_raw_layout);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices." (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly. shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ struct ceph_osd_request *req,
+ struct ceph_osd_req_op *op)
+{
+ u64 bno;
+
+ ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+ plen, &bno, req, op);
+
+ sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+ req->r_oid_len = strlen(req->r_oid);
+}
+
+/*
+ * requests
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+ struct ceph_osd_request *req = container_of(kref,
+ struct ceph_osd_request,
+ r_kref);
+
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply)
+ ceph_msg_put(req->r_reply);
+ if (req->r_con_filling_msg) {
+ dout("release_request revoking pages %p from con %p\n",
+ req->r_pages, req->r_con_filling_msg);
+ ceph_con_revoke_message(req->r_con_filling_msg,
+ req->r_reply);
+ ceph_con_put(req->r_con_filling_msg);
+ }
+ if (req->r_own_pages)
+ ceph_release_page_vector(req->r_pages,
+ req->r_num_pages);
+#ifdef CONFIG_BLOCK
+ if (req->r_bio)
+ bio_put(req->r_bio);
+#endif
+ ceph_put_snap_context(req->r_snapc);
+ if (req->r_trail) {
+ ceph_pagelist_release(req->r_trail);
+ kfree(req->r_trail);
+ }
+ if (req->r_mempool)
+ mempool_free(req, req->r_osdc->req_mempool);
+ else
+ kfree(req);
+}
+EXPORT_SYMBOL(ceph_osdc_release_request);
+
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+ int i = 0;
+
+ if (needs_trail)
+ *needs_trail = 0;
+ while (ops[i].op) {
+ if (needs_trail && op_needs_trail(ops[i].op))
+ *needs_trail = 1;
+ i++;
+ }
+
+ return i;
+}
+
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+ int flags,
+ struct ceph_snap_context *snapc,
+ struct ceph_osd_req_op *ops,
+ bool use_mempool,
+ gfp_t gfp_flags,
+ struct page **pages,
+ struct bio *bio)
+{
+ struct ceph_osd_request *req;
+ struct ceph_msg *msg;
+ int needs_trail;
+ int num_op = get_num_ops(ops, &needs_trail);
+ size_t msg_size = sizeof(struct ceph_osd_request_head);
+
+ msg_size += num_op*sizeof(struct ceph_osd_op);
+
+ if (use_mempool) {
+ req = mempool_alloc(osdc->req_mempool, gfp_flags);
+ memset(req, 0, sizeof(*req));
+ } else {
+ req = kzalloc(sizeof(*req), gfp_flags);
+ }
+ if (req == NULL)
+ return NULL;
+
+ req->r_osdc = osdc;
+ req->r_mempool = use_mempool;
+
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_flags = flags;
+
+ WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+ /* create reply message */
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+ OSD_OPREPLY_FRONT_LEN, gfp_flags);
+ if (!msg) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+ req->r_reply = msg;
+
+ /* allocate space for the trailing data */
+ if (needs_trail) {
+ req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+ if (!req->r_trail) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+ ceph_pagelist_init(req->r_trail);
+ }
+ /* create request message; allow space for oid */
+ msg_size += 40;
+ if (snapc)
+ msg_size += sizeof(u64) * snapc->num_snaps;
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
+ if (!msg) {
+ ceph_osdc_put_request(req);
+ return NULL;
+ }
+
+ msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+ memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+ req->r_request = msg;
+ req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+ if (bio) {
+ req->r_bio = bio;
+ bio_get(req->r_bio);
+ }
+#endif
+
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+static void osd_req_encode_op(struct ceph_osd_request *req,
+ struct ceph_osd_op *dst,
+ struct ceph_osd_req_op *src)
+{
+ dst->op = cpu_to_le16(src->op);
+
+ switch (dst->op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_WRITE:
+ dst->extent.offset =
+ cpu_to_le64(src->extent.offset);
+ dst->extent.length =
+ cpu_to_le64(src->extent.length);
+ dst->extent.truncate_size =
+ cpu_to_le64(src->extent.truncate_size);
+ dst->extent.truncate_seq =
+ cpu_to_le32(src->extent.truncate_seq);
+ break;
+
+ case CEPH_OSD_OP_GETXATTR:
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ BUG_ON(!req->r_trail);
+
+ dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+ dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+ dst->xattr.cmp_op = src->xattr.cmp_op;
+ dst->xattr.cmp_mode = src->xattr.cmp_mode;
+ ceph_pagelist_append(req->r_trail, src->xattr.name,
+ src->xattr.name_len);
+ ceph_pagelist_append(req->r_trail, src->xattr.val,
+ src->xattr.value_len);
+ break;
+ case CEPH_OSD_OP_CALL:
+ BUG_ON(!req->r_trail);
+
+ dst->cls.class_len = src->cls.class_len;
+ dst->cls.method_len = src->cls.method_len;
+ dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+ ceph_pagelist_append(req->r_trail, src->cls.class_name,
+ src->cls.class_len);
+ ceph_pagelist_append(req->r_trail, src->cls.method_name,
+ src->cls.method_len);
+ ceph_pagelist_append(req->r_trail, src->cls.indata,
+ src->cls.indata_len);
+ break;
+ case CEPH_OSD_OP_ROLLBACK:
+ dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+ break;
+ case CEPH_OSD_OP_STARTSYNC:
+ break;
+ default:
+ pr_err("unrecognized osd opcode %d\n", dst->op);
+ WARN_ON(1);
+ break;
+ }
+ dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
+/*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+ u64 off, u64 *plen,
+ struct ceph_osd_req_op *src_ops,
+ struct ceph_snap_context *snapc,
+ struct timespec *mtime,
+ const char *oid,
+ int oid_len)
+{
+ struct ceph_msg *msg = req->r_request;
+ struct ceph_osd_request_head *head;
+ struct ceph_osd_req_op *src_op;
+ struct ceph_osd_op *op;
+ void *p;
+ int num_op = get_num_ops(src_ops, NULL);
+ size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+ int flags = req->r_flags;
+ u64 data_len = 0;
+ int i;
+
+ head = msg->front.iov_base;
+ op = (void *)(head + 1);
+ p = (void *)(op + num_op);
+
+ req->r_snapc = ceph_get_snap_context(snapc);
+
+ head->client_inc = cpu_to_le32(1); /* always, for now. */
+ head->flags = cpu_to_le32(flags);
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ ceph_encode_timespec(&head->mtime, mtime);
+ head->num_ops = cpu_to_le16(num_op);
+
+
+ /* fill in oid */
+ head->object_len = cpu_to_le32(oid_len);
+ memcpy(p, oid, oid_len);
+ p += oid_len;
+
+ src_op = src_ops;
+ while (src_op->op) {
+ osd_req_encode_op(req, op, src_op);
+ src_op++;
+ op++;
+ }
+
+ if (req->r_trail)
+ data_len += req->r_trail->length;
+
+ if (snapc) {
+ head->snap_seq = cpu_to_le64(snapc->seq);
+ head->num_snaps = cpu_to_le32(snapc->num_snaps);
+ for (i = 0; i < snapc->num_snaps; i++) {
+ put_unaligned_le64(snapc->snaps[i], p);
+ p += sizeof(u64);
+ }
+ }
+
+ if (flags & CEPH_OSD_FLAG_WRITE) {
+ req->r_request->hdr.data_off = cpu_to_le16(off);
+ req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+ } else if (data_len) {
+ req->r_request->hdr.data_off = 0;
+ req->r_request->hdr.data_len = cpu_to_le32(data_len);
+ }
+
+ BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+ msg_size = p - msg->front.iov_base;
+ msg->front.iov_len = msg_size;
+ msg->hdr.front_len = cpu_to_le32(msg_size);
+ return;
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately. (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 off, u64 *plen,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ int do_sync,
+ u32 truncate_seq,
+ u64 truncate_size,
+ struct timespec *mtime,
+ bool use_mempool, int num_reply)
+{
+ struct ceph_osd_req_op ops[3];
+ struct ceph_osd_request *req;
+
+ ops[0].op = opcode;
+ ops[0].extent.truncate_seq = truncate_seq;
+ ops[0].extent.truncate_size = truncate_size;
+ ops[0].payload_len = 0;
+
+ if (do_sync) {
+ ops[1].op = CEPH_OSD_OP_STARTSYNC;
+ ops[1].payload_len = 0;
+ ops[2].op = 0;
+ } else
+ ops[1].op = 0;
+
+ req = ceph_osdc_alloc_request(osdc, flags,
+ snapc, ops,
+ use_mempool,
+ GFP_NOFS, NULL, NULL);
+ if (IS_ERR(req))
+ return req;
+
+ /* calculate max write size */
+ calc_layout(osdc, vino, layout, off, plen, req, ops);
+ req->r_file_layout = *layout; /* keep a copy */
+
+ ceph_osdc_build_request(req, off, plen, ops,
+ snapc,
+ mtime,
+ req->r_oid, req->r_oid_len);
+
+ return req;
+}
+EXPORT_SYMBOL(ceph_osdc_new_request);
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *new)
+{
+ struct rb_node **p = &osdc->requests.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_osd_request, r_node);
+ if (new->r_tid < req->r_tid)
+ p = &(*p)->rb_left;
+ else if (new->r_tid > req->r_tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid)
+ n = n->rb_left;
+ else if (tid > req->r_tid)
+ n = n->rb_right;
+ else
+ return req;
+ }
+ return NULL;
+}
+
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid) {
+ if (!n->rb_left)
+ return req;
+ n = n->rb_left;
+ } else if (tid > req->r_tid) {
+ n = n->rb_right;
+ } else {
+ return req;
+ }
+ }
+ return NULL;
+}
+
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+
+ if (!osd)
+ return;
+ dout("osd_reset osd%d\n", osd->o_osd);
+ osdc = osd->o_osdc;
+ down_read(&osdc->map_sem);
+ kick_requests(osdc, osd);
+ up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd *osd;
+
+ osd = kzalloc(sizeof(*osd), GFP_NOFS);
+ if (!osd)
+ return NULL;
+
+ atomic_set(&osd->o_ref, 1);
+ osd->o_osdc = osdc;
+ INIT_LIST_HEAD(&osd->o_requests);
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ osd->o_incarnation = 1;
+
+ ceph_con_init(osdc->client->msgr, &osd->o_con);
+ osd->o_con.private = osd;
+ osd->o_con.ops = &osd_con_ops;
+ osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+ if (atomic_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+ atomic_read(&osd->o_ref));
+ return osd;
+ } else {
+ dout("get_osd %p FAIL\n", osd);
+ return NULL;
+ }
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+ dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+ atomic_read(&osd->o_ref) - 1);
+ if (atomic_dec_and_test(&osd->o_ref)) {
+ struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
+
+ if (osd->o_authorizer)
+ ac->ops->destroy_authorizer(ac, osd->o_authorizer);
+ kfree(osd);
+ }
+}
+
+/*
+ * remove an osd from our map
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ dout("__remove_osd %p\n", osd);
+ BUG_ON(!list_empty(&osd->o_requests));
+ rb_erase(&osd->o_node, &osdc->osds);
+ list_del_init(&osd->o_osd_lru);
+ ceph_con_close(&osd->o_con);
+ put_osd(osd);
+}
+
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+ struct ceph_osd *osd)
+{
+ dout("__move_osd_to_lru %p\n", osd);
+ BUG_ON(!list_empty(&osd->o_osd_lru));
+ list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+ dout("__remove_osd_from_lru %p\n", osd);
+ if (!list_empty(&osd->o_osd_lru))
+ list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+ struct ceph_osd *osd, *nosd;
+
+ dout("__remove_old_osds %p\n", osdc);
+ mutex_lock(&osdc->request_mutex);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (!remove_all && time_before(jiffies, osd->lru_ttl))
+ break;
+ __remove_osd(osdc, osd);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ struct ceph_osd_request *req;
+ int ret = 0;
+
+ dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+ if (list_empty(&osd->o_requests)) {
+ __remove_osd(osdc, osd);
+ } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+ &osd->o_con.peer_addr,
+ sizeof(osd->o_con.peer_addr)) == 0 &&
+ !ceph_con_opened(&osd->o_con)) {
+ dout(" osd addr hasn't changed and connection never opened,"
+ " letting msgr retry");
+ /* touch each r_stamp for handle_timeout()'s benfit */
+ list_for_each_entry(req, &osd->o_requests, r_osd_item)
+ req->r_stamp = jiffies;
+ ret = -EAGAIN;
+ } else {
+ ceph_con_close(&osd->o_con);
+ ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+ osd->o_incarnation++;
+ }
+ return ret;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+ struct rb_node **p = &osdc->osds.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd *osd = NULL;
+
+ while (*p) {
+ parent = *p;
+ osd = rb_entry(parent, struct ceph_osd, o_node);
+ if (new->o_osd < osd->o_osd)
+ p = &(*p)->rb_left;
+ else if (new->o_osd > osd->o_osd)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->o_node, parent, p);
+ rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+ struct ceph_osd *osd;
+ struct rb_node *n = osdc->osds.rb_node;
+
+ while (n) {
+ osd = rb_entry(n, struct ceph_osd, o_node);
+ if (o < osd->o_osd)
+ n = n->rb_left;
+ else if (o > osd->o_osd)
+ n = n->rb_right;
+ else
+ return osd;
+ }
+ return NULL;
+}
+
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->options->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work(&osdc->timeout_work);
+}
+
+/*
+ * Register request, assign tid. If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ mutex_lock(&osdc->request_mutex);
+ req->r_tid = ++osdc->last_tid;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ INIT_LIST_HEAD(&req->r_req_lru_item);
+
+ dout("register_request %p tid %lld\n", req, req->r_tid);
+ __insert_request(osdc, req);
+ ceph_osdc_get_request(req);
+ osdc->num_requests++;
+
+ if (osdc->num_requests == 1) {
+ dout(" first request, scheduling timeout\n");
+ __schedule_osd_timeout(osdc);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ rb_erase(&req->r_node, &osdc->requests);
+ osdc->num_requests--;
+
+ if (req->r_osd) {
+ /* make sure the original request isn't in flight. */
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+ list_del_init(&req->r_osd_item);
+ if (list_empty(&req->r_osd->o_requests))
+ __move_osd_to_lru(osdc, req->r_osd);
+ req->r_osd = NULL;
+ }
+
+ ceph_osdc_put_request(req);
+
+ list_del_init(&req->r_req_lru_item);
+ if (osdc->num_requests == 0) {
+ dout(" no requests, canceling timeout\n");
+ __cancel_osd_timeout(osdc);
+ }
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+ if (req->r_sent && req->r_osd) {
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+ req->r_sent = 0;
+ }
+ list_del_init(&req->r_req_lru_item);
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately. If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ struct ceph_pg pgid;
+ int acting[CEPH_PG_MAX_SIZE];
+ int o = -1, num = 0;
+ int err;
+
+ dout("map_osds %p tid %lld\n", req, req->r_tid);
+ err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+ &req->r_file_layout, osdc->osdmap);
+ if (err)
+ return err;
+ pgid = reqhead->layout.ol_pgid;
+ req->r_pgid = pgid;
+
+ err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+ if (err > 0) {
+ o = acting[0];
+ num = err;
+ }
+
+ if ((req->r_osd && req->r_osd->o_osd == o &&
+ req->r_sent >= req->r_osd->o_incarnation &&
+ req->r_num_pg_osds == num &&
+ memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
+ (req->r_osd == NULL && o == -1))
+ return 0; /* no change */
+
+ dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+ req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+ req->r_osd ? req->r_osd->o_osd : -1);
+
+ /* record full pg acting set */
+ memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+ req->r_num_pg_osds = num;
+
+ if (req->r_osd) {
+ __cancel_request(req);
+ list_del_init(&req->r_osd_item);
+ req->r_osd = NULL;
+ }
+
+ req->r_osd = __lookup_osd(osdc, o);
+ if (!req->r_osd && o >= 0) {
+ err = -ENOMEM;
+ req->r_osd = create_osd(osdc);
+ if (!req->r_osd)
+ goto out;
+
+ dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+ req->r_osd->o_osd = o;
+ req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+ __insert_osd(osdc, req->r_osd);
+
+ ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+ }
+
+ if (req->r_osd) {
+ __remove_osd_from_lru(req->r_osd);
+ list_add(&req->r_osd_item, &req->r_osd->o_requests);
+ }
+ err = 1; /* osd or pg changed */
+
+out:
+ return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead;
+ int err;
+
+ err = __map_osds(osdc, req);
+ if (err < 0)
+ return err;
+ if (req->r_osd == NULL) {
+ dout("send_request %p no up osds in pg\n", req);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ return 0;
+ }
+
+ dout("send_request %p tid %llu to osd%d flags %d\n",
+ req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+ reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
+ reqhead->reassert_version = req->r_reassert_version;
+
+ req->r_stamp = jiffies;
+ list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+
+ ceph_msg_get(req->r_request); /* send consumes a ref */
+ ceph_con_send(&req->r_osd->o_con, req->r_request);
+ req->r_sent = req->r_osd->o_incarnation;
+ return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds. When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected. Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client, timeout_work.work);
+ struct ceph_osd_request *req, *last_req = NULL;
+ struct ceph_osd *osd;
+ unsigned long timeout = osdc->client->options->osd_timeout * HZ;
+ unsigned long keepalive =
+ osdc->client->options->osd_keepalive_timeout * HZ;
+ unsigned long last_stamp = 0;
+ struct rb_node *p;
+ struct list_head slow_osds;
+
+ dout("timeout\n");
+ down_read(&osdc->map_sem);
+
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_resend) {
+ int err;
+
+ dout("osdc resending prev failed %lld\n", req->r_tid);
+ err = __send_request(osdc, req);
+ if (err)
+ dout("osdc failed again on %lld\n", req->r_tid);
+ else
+ req->r_resend = false;
+ continue;
+ }
+ }
+
+ /*
+ * reset osds that appear to be _really_ unresponsive. this
+ * is a failsafe measure.. we really shouldn't be getting to
+ * this point if the system is working properly. the monitors
+ * should mark the osd as failed and we should find out about
+ * it from an updated osd map.
+ */
+ while (timeout && !list_empty(&osdc->req_lru)) {
+ req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+ r_req_lru_item);
+
+ if (time_before(jiffies, req->r_stamp + timeout))
+ break;
+
+ BUG_ON(req == last_req && req->r_stamp == last_stamp);
+ last_req = req;
+ last_stamp = req->r_stamp;
+
+ osd = req->r_osd;
+ BUG_ON(!osd);
+ pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+ req->r_tid, osd->o_osd);
+ __kick_requests(osdc, osd);
+ }
+
+ /*
+ * ping osds that are a bit slow. this ensures that if there
+ * is a break in the TCP connection we will notice, and reopen
+ * a connection with that osd (from the fault callback).
+ */
+ INIT_LIST_HEAD(&slow_osds);
+ list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+ if (time_before(jiffies, req->r_stamp + keepalive))
+ break;
+
+ osd = req->r_osd;
+ BUG_ON(!osd);
+ dout(" tid %llu is slow, will send keepalive on osd%d\n",
+ req->r_tid, osd->o_osd);
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ }
+ while (!list_empty(&slow_osds)) {
+ osd = list_entry(slow_osds.next, struct ceph_osd,
+ o_keepalive_item);
+ list_del_init(&osd->o_keepalive_item);
+ ceph_con_keepalive(&osd->o_con);
+ }
+
+ __schedule_osd_timeout(osdc);
+ mutex_unlock(&osdc->request_mutex);
+
+ up_read(&osdc->map_sem);
+}
+
+static void handle_osds_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client,
+ osds_timeout_work.work);
+ unsigned long delay =
+ osdc->client->options->osd_idle_ttl * HZ >> 2;
+
+ dout("osds timeout\n");
+ down_read(&osdc->map_sem);
+ remove_old_osds(osdc, 0);
+ up_read(&osdc->map_sem);
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(delay));
+}
+
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+ struct ceph_connection *con)
+{
+ struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+ struct ceph_osd_request *req;
+ u64 tid;
+ int numops, object_len, flags;
+ s32 result;
+
+ tid = le64_to_cpu(msg->hdr.tid);
+ if (msg->front.iov_len < sizeof(*rhead))
+ goto bad;
+ numops = le32_to_cpu(rhead->num_ops);
+ object_len = le32_to_cpu(rhead->object_len);
+ result = le32_to_cpu(rhead->result);
+ if (msg->front.iov_len != sizeof(*rhead) + object_len +
+ numops * sizeof(struct ceph_osd_op))
+ goto bad;
+ dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+
+ /* lookup */
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (req == NULL) {
+ dout("handle_reply tid %llu dne\n", tid);
+ mutex_unlock(&osdc->request_mutex);
+ return;
+ }
+ ceph_osdc_get_request(req);
+ flags = le32_to_cpu(rhead->flags);
+
+ /*
+ * if this connection filled our message, drop our reference now, to
+ * avoid a (safe but slower) revoke later.
+ */
+ if (req->r_con_filling_msg == con && req->r_reply == msg) {
+ dout(" dropping con_filling_msg ref %p\n", con);
+ req->r_con_filling_msg = NULL;
+ ceph_con_put(con);
+ }
+
+ if (!req->r_got_reply) {
+ unsigned bytes;
+
+ req->r_result = le32_to_cpu(rhead->result);
+ bytes = le32_to_cpu(msg->hdr.data_len);
+ dout("handle_reply result %d bytes %d\n", req->r_result,
+ bytes);
+ if (req->r_result == 0)
+ req->r_result = bytes;
+
+ /* in case this is a write and we need to replay, */
+ req->r_reassert_version = rhead->reassert_version;
+
+ req->r_got_reply = 1;
+ } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+ dout("handle_reply tid %llu dup ack\n", tid);
+ mutex_unlock(&osdc->request_mutex);
+ goto done;
+ }
+
+ dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+ /* either this is a read, or we got the safe response */
+ if (result < 0 ||
+ (flags & CEPH_OSD_FLAG_ONDISK) ||
+ ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+ __unregister_request(osdc, req);
+
+ mutex_unlock(&osdc->request_mutex);
+
+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete_all(&req->r_completion);
+
+ if (flags & CEPH_OSD_FLAG_ONDISK) {
+ if (req->r_safe_callback)
+ req->r_safe_callback(req, msg);
+ complete_all(&req->r_safe_completion); /* fsync waiter */
+ }
+
+done:
+ ceph_osdc_put_request(req);
+ return;
+
+bad:
+ pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+ (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+ (int)sizeof(*rhead));
+ ceph_msg_dump(msg);
+}
+
+
+static int __kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *p, *n;
+ int needmap = 0;
+ int err;
+
+ dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+ if (kickosd) {
+ err = __reset_osd(osdc, kickosd);
+ if (err == -EAGAIN)
+ return 1;
+ } else {
+ for (p = rb_first(&osdc->osds); p; p = n) {
+ struct ceph_osd *osd =
+ rb_entry(p, struct ceph_osd, o_node);
+
+ n = rb_next(p);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap,
+ osd->o_osd),
+ sizeof(struct ceph_entity_addr)) != 0)
+ __reset_osd(osdc, osd);
+ }
+ }
+
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_resend) {
+ dout(" r_resend set on tid %llu\n", req->r_tid);
+ __cancel_request(req);
+ goto kick;
+ }
+ if (req->r_osd && kickosd == req->r_osd) {
+ __cancel_request(req);
+ goto kick;
+ }
+
+ err = __map_osds(osdc, req);
+ if (err == 0)
+ continue; /* no change */
+ if (err < 0) {
+ /*
+ * FIXME: really, we should set the request
+ * error and fail if this isn't a 'nofail'
+ * request, but that's a fair bit more
+ * complicated to do. So retry!
+ */
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ continue;
+ }
+ if (req->r_osd == NULL) {
+ dout("tid %llu maps to no valid osd\n", req->r_tid);
+ needmap++; /* request a newer map */
+ continue;
+ }
+
+kick:
+ dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ err = __send_request(osdc, req);
+ if (err) {
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ }
+ }
+
+ return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed. Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd)
+{
+ int needmap;
+
+ mutex_lock(&osdc->request_mutex);
+ needmap = __kick_requests(osdc, kickosd);
+ mutex_unlock(&osdc->request_mutex);
+
+ if (needmap) {
+ dout("%d requests for down osds, need new map\n", needmap);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ }
+
+}
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster. Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+ void *p, *end, *next;
+ u32 nr_maps, maplen;
+ u32 epoch;
+ struct ceph_osdmap *newmap = NULL, *oldmap;
+ int err;
+ struct ceph_fsid fsid;
+
+ dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ /* verify fsid */
+ ceph_decode_need(&p, end, sizeof(fsid), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(osdc->client, &fsid) < 0)
+ return;
+
+ down_write(&osdc->map_sem);
+
+ /* incremental maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d inc maps\n", nr_maps);
+ while (nr_maps > 0) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ next = p + maplen;
+ if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ dout("applying incremental map %u len %d\n",
+ epoch, maplen);
+ newmap = osdmap_apply_incremental(&p, next,
+ osdc->osdmap,
+ osdc->client->msgr);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ if (newmap != osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
+ }
+ } else {
+ dout("ignoring incremental map %u len %d\n",
+ epoch, maplen);
+ }
+ p = next;
+ nr_maps--;
+ }
+ if (newmap)
+ goto done;
+
+ /* full maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d full maps\n", nr_maps);
+ while (nr_maps) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ if (nr_maps > 1) {
+ dout("skipping non-latest full map %u len %d\n",
+ epoch, maplen);
+ } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ dout("skipping full map %u len %d, "
+ "older than our %u\n", epoch, maplen,
+ osdc->osdmap->epoch);
+ } else {
+ dout("taking full map %u len %d\n", epoch, maplen);
+ newmap = osdmap_decode(&p, p+maplen);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ oldmap = osdc->osdmap;
+ osdc->osdmap = newmap;
+ if (oldmap)
+ ceph_osdmap_destroy(oldmap);
+ }
+ p += maplen;
+ nr_maps--;
+ }
+
+done:
+ downgrade_write(&osdc->map_sem);
+ ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+ if (newmap)
+ kick_requests(osdc, NULL);
+ up_read(&osdc->map_sem);
+ wake_up_all(&osdc->client->auth_wq);
+ return;
+
+bad:
+ pr_err("osdc handle_map corrupt msg\n");
+ ceph_msg_dump(msg);
+ up_write(&osdc->map_sem);
+ return;
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
+{
+ int rc = 0;
+
+ req->r_request->pages = req->r_pages;
+ req->r_request->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+ req->r_request->bio = req->r_bio;
+#endif
+ req->r_request->trail = req->r_trail;
+
+ register_request(osdc, req);
+
+ down_read(&osdc->map_sem);
+ mutex_lock(&osdc->request_mutex);
+ /*
+ * a racing kick_requests() may have sent the message for us
+ * while we dropped request_mutex above, so only send now if
+ * the request still han't been touched yet.
+ */
+ if (req->r_sent == 0) {
+ rc = __send_request(osdc, req);
+ if (rc) {
+ if (nofail) {
+ dout("osdc_start_request failed send, "
+ " marking %lld\n", req->r_tid);
+ req->r_resend = true;
+ rc = 0;
+ } else {
+ __unregister_request(osdc, req);
+ }
+ }
+ }
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+ return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_start_request);
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ int rc;
+
+ rc = wait_for_completion_interruptible(&req->r_completion);
+ if (rc < 0) {
+ mutex_lock(&osdc->request_mutex);
+ __cancel_request(req);
+ __unregister_request(osdc, req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+ return rc;
+ }
+
+ dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+ return req->r_result;
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
+
+/*
+ * sync - wait for all in-flight requests to flush. avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_request *req;
+ u64 last_tid, next_tid = 0;
+
+ mutex_lock(&osdc->request_mutex);
+ last_tid = osdc->last_tid;
+ while (1) {
+ req = __lookup_request_ge(osdc, next_tid);
+ if (!req)
+ break;
+ if (req->r_tid > last_tid)
+ break;
+
+ next_tid = req->r_tid + 1;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync waiting on tid %llu (last is %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&osdc->request_mutex);
+ ceph_osdc_put_request(req);
+ }
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync done (thru tid %llu)\n", last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+ int err;
+
+ dout("init\n");
+ osdc->client = client;
+ osdc->osdmap = NULL;
+ init_rwsem(&osdc->map_sem);
+ init_completion(&osdc->map_waiters);
+ osdc->last_requested_map = 0;
+ mutex_init(&osdc->request_mutex);
+ osdc->last_tid = 0;
+ osdc->osds = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->osd_lru);
+ osdc->requests = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->req_lru);
+ osdc->num_requests = 0;
+ INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+ INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+
+ err = -ENOMEM;
+ osdc->req_mempool = mempool_create_kmalloc_pool(10,
+ sizeof(struct ceph_osd_request));
+ if (!osdc->req_mempool)
+ goto out;
+
+ err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+ "osd_op");
+ if (err < 0)
+ goto out_mempool;
+ err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+ OSD_OPREPLY_FRONT_LEN, 10, true,
+ "osd_op_reply");
+ if (err < 0)
+ goto out_msgpool;
+ return 0;
+
+out_msgpool:
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+ mempool_destroy(osdc->req_mempool);
+out:
+ return err;
+}
+EXPORT_SYMBOL(ceph_osdc_init);
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work_sync(&osdc->timeout_work);
+ cancel_delayed_work_sync(&osdc->osds_timeout_work);
+ if (osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = NULL;
+ }
+ remove_old_osds(osdc, 1);
+ mempool_destroy(osdc->req_mempool);
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+EXPORT_SYMBOL(ceph_osdc_stop);
+
+/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, 0, truncate_seq, truncate_size, NULL,
+ false, 1);
+ if (!req)
+ return -ENOMEM;
+
+ /* it may be a short read due to an object boundary */
+ req->r_pages = pages;
+
+ dout("readpages final extent is %llu~%llu (%d pages)\n",
+ off, *plen, req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_readpages);
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec *mtime,
+ struct page **pages, int num_pages,
+ int flags, int do_sync, bool nofail)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ BUG_ON(vino.snap != CEPH_NOSNAP);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+ CEPH_OSD_OP_WRITE,
+ flags | CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE,
+ snapc, do_sync,
+ truncate_seq, truncate_size, mtime,
+ nofail, 1);
+ if (!req)
+ return -ENOMEM;
+
+ /* it may be a short write due to an object boundary */
+ req->r_pages = pages;
+ dout("writepages %llu~%llu (%d pages)\n", off, len,
+ req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, nofail);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_writepages);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!osd)
+ goto out;
+ osdc = osd->o_osdc;
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(osdc, msg);
+ break;
+ case CEPH_MSG_OSD_OPREPLY:
+ handle_reply(osdc, msg, con);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+out:
+ ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply. set up reply message
+ * pages.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_msg *m;
+ struct ceph_osd_request *req;
+ int front = le32_to_cpu(hdr->front_len);
+ int data_len = le32_to_cpu(hdr->data_len);
+ u64 tid;
+
+ tid = le64_to_cpu(hdr->tid);
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (!req) {
+ *skip = 1;
+ m = NULL;
+ pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+ osd->o_osd);
+ goto out;
+ }
+
+ if (req->r_con_filling_msg) {
+ dout("get_reply revoking msg %p from old con %p\n",
+ req->r_reply, req->r_con_filling_msg);
+ ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+ ceph_con_put(req->r_con_filling_msg);
+ req->r_con_filling_msg = NULL;
+ }
+
+ if (front > req->r_reply->front.iov_len) {
+ pr_warning("get_reply front %d > preallocated %d\n",
+ front, (int)req->r_reply->front.iov_len);
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+ if (!m)
+ goto out;
+ ceph_msg_put(req->r_reply);
+ req->r_reply = m;
+ }
+ m = ceph_msg_get(req->r_reply);
+
+ if (data_len > 0) {
+ unsigned data_off = le16_to_cpu(hdr->data_off);
+ int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+ if (unlikely(req->r_num_pages < want)) {
+ pr_warning("tid %lld reply %d > expected %d pages\n",
+ tid, want, m->nr_pages);
+ *skip = 1;
+ ceph_msg_put(m);
+ m = NULL;
+ goto out;
+ }
+ m->pages = req->r_pages;
+ m->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+ m->bio = req->r_bio;
+#endif
+ }
+ *skip = 0;
+ req->r_con_filling_msg = ceph_con_get(con);
+ dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+ mutex_unlock(&osdc->request_mutex);
+ return m;
+
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front = le32_to_cpu(hdr->front_len);
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ return ceph_msg_new(type, front, GFP_NOFS);
+ case CEPH_MSG_OSD_OPREPLY:
+ return get_reply(con, hdr, skip);
+ default:
+ pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+ osd->o_osd);
+ *skip = 1;
+ return NULL;
+ }
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ if (get_osd(osd))
+ return con;
+ return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+ int ret = 0;
+
+ if (force_new && o->o_authorizer) {
+ ac->ops->destroy_authorizer(ac, o->o_authorizer);
+ o->o_authorizer = NULL;
+ }
+ if (o->o_authorizer == NULL) {
+ ret = ac->ops->create_authorizer(
+ ac, CEPH_ENTITY_TYPE_OSD,
+ &o->o_authorizer,
+ &o->o_authorizer_buf,
+ &o->o_authorizer_buf_len,
+ &o->o_authorizer_reply_buf,
+ &o->o_authorizer_reply_buf_len);
+ if (ret)
+ return ret;
+ }
+
+ *proto = ac->protocol;
+ *buf = o->o_authorizer_buf;
+ *len = o->o_authorizer_buf_len;
+ *reply_buf = o->o_authorizer_reply_buf;
+ *reply_len = o->o_authorizer_reply_buf_len;
+ return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+ return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+static const struct ceph_connection_operations osd_con_ops = {
+ .get = get_osd_con,
+ .put = put_osd_con,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .invalidate_authorizer = invalidate_authorizer,
+ .alloc_msg = alloc_msg,
+ .fault = osd_reset,
+};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 00000000000..d73f3f6efa3
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1128 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/div64.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+ int flag = 0;
+
+ if (!len)
+ goto done;
+
+ *str = '\0';
+ if (state) {
+ if (state & CEPH_OSD_EXISTS) {
+ snprintf(str, len, "exists");
+ flag = 1;
+ }
+ if (state & CEPH_OSD_UP) {
+ snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
+ "up");
+ flag = 1;
+ }
+ } else {
+ snprintf(str, len, "doesn't exist");
+ }
+done:
+ return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+ int b = 0;
+ while (t) {
+ t = t >> 1;
+ b++;
+ }
+ return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+ pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+ pi->pgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+ pi->lpg_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+ pi->lpgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+ struct crush_bucket_uniform *b)
+{
+ dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+ ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+ b->item_weight = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+ struct crush_bucket_list *b)
+{
+ int j;
+ dout("crush_decode_list_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->sum_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->sum_weights[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+ struct crush_bucket_tree *b)
+{
+ int j;
+ dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+ ceph_decode_32_safe(p, end, b->num_nodes, bad);
+ b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+ if (b->node_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+ for (j = 0; j < b->num_nodes; j++)
+ b->node_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+ struct crush_bucket_straw *b)
+{
+ int j;
+ dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->straws == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->straws[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+ struct crush_map *c;
+ int err = -EINVAL;
+ int i, j;
+ void **p = &pbyval;
+ void *start = pbyval;
+ u32 magic;
+
+ dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ c = kzalloc(sizeof(*c), GFP_NOFS);
+ if (c == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ magic = ceph_decode_32(p);
+ if (magic != CRUSH_MAGIC) {
+ pr_err("crush_decode magic %x != current %x\n",
+ (unsigned)magic, (unsigned)CRUSH_MAGIC);
+ goto bad;
+ }
+ c->max_buckets = ceph_decode_32(p);
+ c->max_rules = ceph_decode_32(p);
+ c->max_devices = ceph_decode_32(p);
+
+ c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+ if (c->device_parents == NULL)
+ goto badmem;
+ c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+ if (c->bucket_parents == NULL)
+ goto badmem;
+
+ c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+ if (c->buckets == NULL)
+ goto badmem;
+ c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+ if (c->rules == NULL)
+ goto badmem;
+
+ /* buckets */
+ for (i = 0; i < c->max_buckets; i++) {
+ int size = 0;
+ u32 alg;
+ struct crush_bucket *b;
+
+ ceph_decode_32_safe(p, end, alg, bad);
+ if (alg == 0) {
+ c->buckets[i] = NULL;
+ continue;
+ }
+ dout("crush_decode bucket %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ size = sizeof(struct crush_bucket_uniform);
+ break;
+ case CRUSH_BUCKET_LIST:
+ size = sizeof(struct crush_bucket_list);
+ break;
+ case CRUSH_BUCKET_TREE:
+ size = sizeof(struct crush_bucket_tree);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ size = sizeof(struct crush_bucket_straw);
+ break;
+ default:
+ err = -EINVAL;
+ goto bad;
+ }
+ BUG_ON(size == 0);
+ b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+ if (b == NULL)
+ goto badmem;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ b->id = ceph_decode_32(p);
+ b->type = ceph_decode_16(p);
+ b->alg = ceph_decode_8(p);
+ b->hash = ceph_decode_8(p);
+ b->weight = ceph_decode_32(p);
+ b->size = ceph_decode_32(p);
+
+ dout("crush_decode bucket size %d off %x %p to %p\n",
+ b->size, (int)(*p-start), *p, end);
+
+ b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+ if (b->items == NULL)
+ goto badmem;
+ b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+ if (b->perm == NULL)
+ goto badmem;
+ b->perm_n = 0;
+
+ ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+ for (j = 0; j < b->size; j++)
+ b->items[j] = ceph_decode_32(p);
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ err = crush_decode_uniform_bucket(p, end,
+ (struct crush_bucket_uniform *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_LIST:
+ err = crush_decode_list_bucket(p, end,
+ (struct crush_bucket_list *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_TREE:
+ err = crush_decode_tree_bucket(p, end,
+ (struct crush_bucket_tree *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_STRAW:
+ err = crush_decode_straw_bucket(p, end,
+ (struct crush_bucket_straw *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ }
+ }
+
+ /* rules */
+ dout("rule vec is %p\n", c->rules);
+ for (i = 0; i < c->max_rules; i++) {
+ u32 yes;
+ struct crush_rule *r;
+
+ ceph_decode_32_safe(p, end, yes, bad);
+ if (!yes) {
+ dout("crush_decode NO rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+ c->rules[i] = NULL;
+ continue;
+ }
+
+ dout("crush_decode rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ /* len */
+ ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+ err = -EINVAL;
+ if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+ goto bad;
+#endif
+ r = c->rules[i] = kmalloc(sizeof(*r) +
+ yes*sizeof(struct crush_rule_step),
+ GFP_NOFS);
+ if (r == NULL)
+ goto badmem;
+ dout(" rule %d is at %p\n", i, r);
+ r->len = yes;
+ ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+ for (j = 0; j < r->len; j++) {
+ r->steps[j].op = ceph_decode_32(p);
+ r->steps[j].arg1 = ceph_decode_32(p);
+ r->steps[j].arg2 = ceph_decode_32(p);
+ }
+ }
+
+ /* ignore trailing name maps. */
+
+ dout("crush_decode success\n");
+ return c;
+
+badmem:
+ err = -ENOMEM;
+bad:
+ dout("crush_decode fail %d\n", err);
+ crush_destroy(c);
+ return ERR_PTR(err);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+ u64 a = *(u64 *)&l;
+ u64 b = *(u64 *)&r;
+
+ if (a < b)
+ return -1;
+ if (a > b)
+ return 1;
+ return 0;
+}
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+ struct rb_root *root)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_mapping *pg = NULL;
+ int c;
+
+ while (*p) {
+ parent = *p;
+ pg = rb_entry(parent, struct ceph_pg_mapping, node);
+ c = pgid_cmp(new->pgid, pg->pgid);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+ struct ceph_pg pgid)
+{
+ struct rb_node *n = root->rb_node;
+ struct ceph_pg_mapping *pg;
+ int c;
+
+ while (n) {
+ pg = rb_entry(n, struct ceph_pg_mapping, node);
+ c = pgid_cmp(pgid, pg->pgid);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else
+ return pg;
+ }
+ return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_pool_info *pi = NULL;
+
+ while (*p) {
+ parent = *p;
+ pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+ if (new->id < pi->id)
+ p = &(*p)->rb_left;
+ else if (new->id > pi->id)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+ struct ceph_pg_pool_info *pi;
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ pi = rb_entry(n, struct ceph_pg_pool_info, node);
+ if (id < pi->id)
+ n = n->rb_left;
+ else if (id > pi->id)
+ n = n->rb_right;
+ else
+ return pi;
+ }
+ return NULL;
+}
+
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+ struct rb_node *rbp;
+
+ for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rbp, struct ceph_pg_pool_info, node);
+ if (pi->name && strcmp(pi->name, name) == 0)
+ return pi->id;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+ rb_erase(&pi->node, root);
+ kfree(pi->name);
+ kfree(pi);
+}
+
+static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+{
+ unsigned n, m;
+
+ ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+ calc_pg_masks(pi);
+
+ /* num_snaps * snap_info_t */
+ n = le32_to_cpu(pi->v.num_snaps);
+ while (n--) {
+ ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
+ sizeof(struct ceph_timespec), bad);
+ *p += sizeof(u64) + /* key */
+ 1 + sizeof(u64) + /* u8, snapid */
+ sizeof(struct ceph_timespec);
+ m = ceph_decode_32(p); /* snap name */
+ *p += m;
+ }
+
+ *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+ struct ceph_pg_pool_info *pi;
+ u32 num, len, pool;
+
+ ceph_decode_32_safe(p, end, num, bad);
+ dout(" %d pool names\n", num);
+ while (num--) {
+ ceph_decode_32_safe(p, end, pool, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ dout(" pool %d len %d\n", pool, len);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi) {
+ kfree(pi->name);
+ pi->name = kmalloc(len + 1, GFP_NOFS);
+ if (pi->name) {
+ memcpy(pi->name, *p, len);
+ pi->name[len] = '\0';
+ dout(" name is %s\n", pi->name);
+ }
+ }
+ *p += len;
+ }
+ return 0;
+
+bad:
+ return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+ dout("osdmap_destroy %p\n", map);
+ if (map->crush)
+ crush_destroy(map->crush);
+ while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_temp);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rb_first(&map->pg_pools),
+ struct ceph_pg_pool_info, node);
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
+ kfree(map->osd_state);
+ kfree(map->osd_weight);
+ kfree(map->osd_addr);
+ kfree(map);
+}
+
+/*
+ * adjust max osd value. reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+ u8 *state;
+ struct ceph_entity_addr *addr;
+ u32 *weight;
+
+ state = kcalloc(max, sizeof(*state), GFP_NOFS);
+ addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+ weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+ if (state == NULL || addr == NULL || weight == NULL) {
+ kfree(state);
+ kfree(addr);
+ kfree(weight);
+ return -ENOMEM;
+ }
+
+ /* copy old? */
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+ memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+ memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+ kfree(map->osd_state);
+ kfree(map->osd_addr);
+ kfree(map->osd_weight);
+ }
+
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
+ map->max_osd = max;
+ return 0;
+}
+
+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+ struct ceph_osdmap *map;
+ u16 version;
+ u32 len, max, i;
+ u8 ev;
+ int err = -EINVAL;
+ void *start = *p;
+ struct ceph_pg_pool_info *pi;
+
+ dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ map = kzalloc(sizeof(*map), GFP_NOFS);
+ if (map == NULL)
+ return ERR_PTR(-ENOMEM);
+ map->pg_temp = RB_ROOT;
+
+ ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_VERSION) {
+ pr_warning("got unknown v %d > %d of osdmap\n", version,
+ CEPH_OSDMAP_VERSION);
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+ ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+ map->epoch = ceph_decode_32(p);
+ ceph_decode_copy(p, &map->created, sizeof(map->created));
+ ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+ ceph_decode_32_safe(p, end, max, bad);
+ while (max--) {
+ ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ goto bad;
+ pi->id = ceph_decode_32(p);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
+ kfree(pi);
+ goto bad;
+ }
+ err = __decode_pool(p, end, pi);
+ if (err < 0)
+ goto bad;
+ __insert_pg_pool(&map->pg_pools, pi);
+ }
+
+ if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+ goto bad;
+
+ ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+ ceph_decode_32_safe(p, end, map->flags, bad);
+
+ max = ceph_decode_32(p);
+
+ /* (re)alloc osd arrays */
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+ /* osds */
+ err = -EINVAL;
+ ceph_decode_need(p, end, 3*sizeof(u32) +
+ map->max_osd*(1 + sizeof(*map->osd_weight) +
+ sizeof(*map->osd_addr)), bad);
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+ *p += 4; /* skip length field (should match max) */
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_weight[i] = ceph_decode_32(p);
+
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+ for (i = 0; i < map->max_osd; i++)
+ ceph_decode_addr(&map->osd_addr[i]);
+
+ /* pg_temp */
+ ceph_decode_32_safe(p, end, len, bad);
+ for (i = 0; i < len; i++) {
+ int n, j;
+ struct ceph_pg pgid;
+ struct ceph_pg_mapping *pg;
+
+ ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ n = ceph_decode_32(p);
+ ceph_decode_need(p, end, n * sizeof(u32), bad);
+ err = -ENOMEM;
+ pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+ if (!pg)
+ goto bad;
+ pg->pgid = pgid;
+ pg->len = n;
+ for (j = 0; j < n; j++)
+ pg->osds[j] = ceph_decode_32(p);
+
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err)
+ goto bad;
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+ }
+
+ /* crush */
+ ceph_decode_32_safe(p, end, len, bad);
+ dout("osdmap_decode crush len %d from off 0x%x\n", len,
+ (int)(*p - start));
+ ceph_decode_need(p, end, len, bad);
+ map->crush = crush_decode(*p, end);
+ *p += len;
+ if (IS_ERR(map->crush)) {
+ err = PTR_ERR(map->crush);
+ map->crush = NULL;
+ goto bad;
+ }
+
+ /* ignore the rest of the map */
+ *p = end;
+
+ dout("osdmap_decode done %p %p\n", *p, end);
+ return map;
+
+bad:
+ dout("osdmap_decode fail\n");
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr)
+{
+ struct crush_map *newcrush = NULL;
+ struct ceph_fsid fsid;
+ u32 epoch = 0;
+ struct ceph_timespec modified;
+ u32 len, pool;
+ __s32 new_pool_max, new_flags, max;
+ void *start = *p;
+ int err = -EINVAL;
+ u16 version;
+ struct rb_node *rbp;
+
+ ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_INC_VERSION) {
+ pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+ CEPH_OSDMAP_INC_VERSION);
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+ bad);
+ ceph_decode_copy(p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(p);
+ BUG_ON(epoch != map->epoch+1);
+ ceph_decode_copy(p, &modified, sizeof(modified));
+ new_pool_max = ceph_decode_32(p);
+ new_flags = ceph_decode_32(p);
+
+ /* full map? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental full map len %d, %p to %p\n",
+ len, *p, end);
+ return osdmap_decode(p, min(*p+len, end));
+ }
+
+ /* new crush? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental new crush map len %d, %p to %p\n",
+ len, *p, end);
+ newcrush = crush_decode(*p, min(*p+len, end));
+ if (IS_ERR(newcrush))
+ return ERR_CAST(newcrush);
+ *p += len;
+ }
+
+ /* new flags? */
+ if (new_flags >= 0)
+ map->flags = new_flags;
+ if (new_pool_max >= 0)
+ map->pool_max = new_pool_max;
+
+ ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+ /* new max? */
+ max = ceph_decode_32(p);
+ if (max >= 0) {
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ }
+
+ map->epoch++;
+ map->modified = map->modified;
+ if (newcrush) {
+ if (map->crush)
+ crush_destroy(map->crush);
+ map->crush = newcrush;
+ newcrush = NULL;
+ }
+
+ /* new_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ __u8 ev;
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
+ goto bad;
+ }
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!pi) {
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pi->id = pool;
+ __insert_pg_pool(&map->pg_pools, pi);
+ }
+ err = __decode_pool(p, end, pi);
+ if (err < 0)
+ goto bad;
+ }
+ if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+ goto bad;
+
+ /* old_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi)
+ __remove_pg_pool(&map->pg_pools, pi);
+ }
+
+ /* new_up */
+ err = -EINVAL;
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ struct ceph_entity_addr addr;
+ ceph_decode_32_safe(p, end, osd, bad);
+ ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+ ceph_decode_addr(&addr);
+ pr_info("osd%d up\n", osd);
+ BUG_ON(osd >= map->max_osd);
+ map->osd_state[osd] |= CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ /* new_down */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ ceph_decode_32_safe(p, end, osd, bad);
+ (*p)++; /* clean flag */
+ pr_info("osd%d down\n", osd);
+ if (osd < map->max_osd)
+ map->osd_state[osd] &= ~CEPH_OSD_UP;
+ }
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd, off;
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ osd = ceph_decode_32(p);
+ off = ceph_decode_32(p);
+ pr_info("osd%d weight 0x%x %s\n", osd, off,
+ off == CEPH_OSD_IN ? "(in)" :
+ (off == CEPH_OSD_OUT ? "(out)" : ""));
+ if (osd < map->max_osd)
+ map->osd_weight[osd] = off;
+ }
+
+ /* new_pg_temp */
+ rbp = rb_first(&map->pg_temp);
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ struct ceph_pg_mapping *pg;
+ int j;
+ struct ceph_pg pgid;
+ u32 pglen;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ pglen = ceph_decode_32(p);
+
+ /* remove any? */
+ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+ node)->pgid, pgid) <= 0) {
+ struct ceph_pg_mapping *cur =
+ rb_entry(rbp, struct ceph_pg_mapping, node);
+
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+ rb_erase(&cur->node, &map->pg_temp);
+ kfree(cur);
+ }
+
+ if (pglen) {
+ /* insert */
+ ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+ pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+ if (!pg) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pg->pgid = pgid;
+ pg->len = pglen;
+ for (j = 0; j < pglen; j++)
+ pg->osds[j] = ceph_decode_32(p);
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err) {
+ kfree(pg);
+ goto bad;
+ }
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+ pglen);
+ }
+ }
+ while (rbp) {
+ struct ceph_pg_mapping *cur =
+ rb_entry(rbp, struct ceph_pg_mapping, node);
+
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+ rb_erase(&cur->node, &map->pg_temp);
+ kfree(cur);
+ }
+
+ /* ignore the rest */
+ *p = end;
+ return map;
+
+bad:
+ pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+ epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ if (newcrush)
+ crush_destroy(newcrush);
+ return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u64 *ono,
+ u64 *oxoff, u64 *oxlen)
+{
+ u32 osize = le32_to_cpu(layout->fl_object_size);
+ u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 bl, stripeno, stripepos, objsetno;
+ u32 su_per_object;
+ u64 t, su_offset;
+
+ dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
+ osize, su);
+ su_per_object = osize / su;
+ dout("osize %u / su %u = su_per_object %u\n", osize, su,
+ su_per_object);
+
+ BUG_ON((su & ~PAGE_MASK) != 0);
+ /* bl = *off / su; */
+ t = off;
+ do_div(t, su);
+ bl = t;
+ dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+ stripeno = bl / sc;
+ stripepos = bl % sc;
+ objsetno = stripeno / su_per_object;
+
+ *ono = objsetno * sc + stripepos;
+ dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+ /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
+ t = off;
+ su_offset = do_div(t, su);
+ *oxoff = su_offset + (stripeno % su_per_object) * su;
+
+ /*
+ * Calculate the length of the extent being written to the selected
+ * object. This is the minimum of the full length requested (plen) or
+ * the remainder of the current stripe being written to.
+ */
+ *oxlen = min_t(u64, *plen, su - su_offset);
+ *plen = *oxlen;
+
+ dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+ const char *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap)
+{
+ unsigned num, num_mask;
+ struct ceph_pg pgid;
+ s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+ int poolid = le32_to_cpu(fl->fl_pg_pool);
+ struct ceph_pg_pool_info *pool;
+ unsigned ps;
+
+ BUG_ON(!osdmap);
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return -EIO;
+ ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+ if (preferred >= 0) {
+ ps += preferred;
+ num = le32_to_cpu(pool->v.lpg_num);
+ num_mask = pool->lpg_num_mask;
+ } else {
+ num = le32_to_cpu(pool->v.pg_num);
+ num_mask = pool->pg_num_mask;
+ }
+
+ pgid.ps = cpu_to_le16(ps);
+ pgid.preferred = cpu_to_le16(preferred);
+ pgid.pool = fl->fl_pg_pool;
+ if (preferred >= 0)
+ dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+ (int)preferred);
+ else
+ dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+ ol->ol_pgid = pgid;
+ ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+ return 0;
+}
+EXPORT_SYMBOL(ceph_calc_object_layout);
+
+/*
+ * Calculate raw osd vector for the given pgid. Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *osds, int *num)
+{
+ struct ceph_pg_mapping *pg;
+ struct ceph_pg_pool_info *pool;
+ int ruleno;
+ unsigned poolid, ps, pps;
+ int preferred;
+
+ /* pg_temp? */
+ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+ if (pg) {
+ *num = pg->len;
+ return pg->osds;
+ }
+
+ /* crush */
+ poolid = le32_to_cpu(pgid.pool);
+ ps = le16_to_cpu(pgid.ps);
+ preferred = (s16)le16_to_cpu(pgid.preferred);
+
+ /* don't forcefeed bad device ids to crush */
+ if (preferred >= osdmap->max_osd ||
+ preferred >= osdmap->crush->max_devices)
+ preferred = -1;
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return NULL;
+ ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+ pool->v.type, pool->v.size);
+ if (ruleno < 0) {
+ pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
+ poolid, pool->v.crush_ruleset, pool->v.type,
+ pool->v.size);
+ return NULL;
+ }
+
+ if (preferred >= 0)
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.lpgp_num),
+ pool->lpgp_num_mask);
+ else
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.pgp_num),
+ pool->pgp_num_mask);
+ pps += poolid;
+ *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+ min_t(int, pool->v.size, *num),
+ preferred, osdmap->osd_weight);
+ return osds;
+}
+
+/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *acting)
+{
+ int rawosds[CEPH_PG_MAX_SIZE], *osds;
+ int i, o, num = CEPH_PG_MAX_SIZE;
+
+ osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+ if (!osds)
+ return -1;
+
+ /* primary is first up osd */
+ o = 0;
+ for (i = 0; i < num; i++)
+ if (ceph_osd_is_up(osdmap, osds[i]))
+ acting[o++] = osds[i];
+ return o;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+ int rawosds[CEPH_PG_MAX_SIZE], *osds;
+ int i, num = CEPH_PG_MAX_SIZE;
+
+ osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+ if (!osds)
+ return -1;
+
+ /* primary is first up osd */
+ for (i = 0; i < num; i++)
+ if (ceph_osd_is_up(osdmap, osds[i]))
+ return osds[i];
+ return -1;
+}
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 00000000000..13cb409a7bb
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+ if (pl->mapped_tail) {
+ struct page *page = list_entry(pl->head.prev, struct page, lru);
+ kunmap(page);
+ pl->mapped_tail = NULL;
+ }
+}
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+ ceph_pagelist_unmap_tail(pl);
+ while (!list_empty(&pl->head)) {
+ struct page *page = list_first_entry(&pl->head, struct page,
+ lru);
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ ceph_pagelist_free_reserve(pl);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+ struct page *page;
+
+ if (!pl->num_pages_free) {
+ page = __page_cache_alloc(GFP_NOFS);
+ } else {
+ page = list_first_entry(&pl->free_list, struct page, lru);
+ list_del(&page->lru);
+ --pl->num_pages_free;
+ }
+ if (!page)
+ return -ENOMEM;
+ pl->room += PAGE_SIZE;
+ ceph_pagelist_unmap_tail(pl);
+ list_add_tail(&page->lru, &pl->head);
+ pl->mapped_tail = kmap(page);
+ return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+ while (pl->room < len) {
+ size_t bit = pl->room;
+ int ret;
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+ buf, bit);
+ pl->length += bit;
+ pl->room -= bit;
+ buf += bit;
+ len -= bit;
+ ret = ceph_pagelist_addpage(pl);
+ if (ret)
+ return ret;
+ }
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+ pl->length += len;
+ pl->room -= len;
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/**
+ * Allocate enough pages for a pagelist to append the given amount
+ * of data without without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+ if (space <= pl->room)
+ return 0;
+ space -= pl->room;
+ space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
+
+ while (space > pl->num_pages_free) {
+ struct page *page = __page_cache_alloc(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ list_add_tail(&page->lru, &pl->free_list);
+ ++pl->num_pages_free;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/**
+ * Free any pages that have been preallocated.
+ */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+ while (!list_empty(&pl->free_list)) {
+ struct page *page = list_first_entry(&pl->free_list,
+ struct page, lru);
+ list_del(&page->lru);
+ __free_page(page);
+ --pl->num_pages_free;
+ }
+ BUG_ON(pl->num_pages_free);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
+
+/**
+ * Create a truncation point.
+ */
+void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c)
+{
+ c->pl = pl;
+ c->page_lru = pl->head.prev;
+ c->room = pl->room;
+}
+EXPORT_SYMBOL(ceph_pagelist_set_cursor);
+
+/**
+ * Truncate a pagelist to the given point. Move extra pages to reserve.
+ * This won't sleep.
+ * Returns: 0 on success,
+ * -EINVAL if the pagelist doesn't match the trunc point pagelist
+ */
+int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c)
+{
+ struct page *page;
+
+ if (pl != c->pl)
+ return -EINVAL;
+ ceph_pagelist_unmap_tail(pl);
+ while (pl->head.prev != c->page_lru) {
+ page = list_entry(pl->head.prev, struct page, lru);
+ list_del(&page->lru); /* remove from pagelist */
+ list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
+ ++pl->num_pages_free;
+ }
+ pl->room = c->room;
+ if (!list_empty(&pl->head)) {
+ page = list_entry(pl->head.prev, struct page, lru);
+ pl->mapped_tail = kmap(page);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 00000000000..54caf068715
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/*
+ * build a vector of user pages
+ */
+struct page **ceph_get_direct_page_vector(const char __user *data,
+ int num_pages,
+ loff_t off, size_t len)
+{
+ struct page **pages;
+ int rc;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ down_read(&current->mm->mmap_sem);
+ rc = get_user_pages(current, current->mm, (unsigned long)data,
+ num_pages, 0, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ goto fail;
+ return pages;
+
+fail:
+ kfree(pages);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+void ceph_put_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ put_page(pages[i]);
+ kfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ __free_pages(pages[i], 0);
+ kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+ struct page **pages;
+ int i;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, flags);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = __page_cache_alloc(flags);
+ if (pages[i] == NULL) {
+ ceph_release_page_vector(pages, i);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/*
+ * copy user data into a page vector
+ */
+int ceph_copy_user_to_page_vector(struct page **pages,
+ const char __user *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ int po = off & ~PAGE_CACHE_MASK;
+ int left = len;
+ int l, bad;
+
+ while (left > 0) {
+ l = min_t(int, PAGE_CACHE_SIZE-po, left);
+ bad = copy_from_user(page_address(pages[i]) + po, data, l);
+ if (bad == l)
+ return -EFAULT;
+ data += l - bad;
+ left -= l - bad;
+ po += l - bad;
+ if (po == PAGE_CACHE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+ return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+int ceph_copy_to_page_vector(struct page **pages,
+ const char *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ size_t po = off & ~PAGE_CACHE_MASK;
+ size_t left = len;
+ size_t l;
+
+ while (left > 0) {
+ l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+ memcpy(page_address(pages[i]) + po, data, l);
+ data += l;
+ left -= l;
+ po += l;
+ if (po == PAGE_CACHE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+ return len;
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+int ceph_copy_from_page_vector(struct page **pages,
+ char *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ size_t po = off & ~PAGE_CACHE_MASK;
+ size_t left = len;
+ size_t l;
+
+ while (left > 0) {
+ l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+ memcpy(data, page_address(pages[i]) + po, l);
+ data += l;
+ left -= l;
+ po += l;
+ if (po == PAGE_CACHE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+ return len;
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+int ceph_copy_page_vector_to_user(struct page **pages,
+ char __user *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ int po = off & ~PAGE_CACHE_MASK;
+ int left = len;
+ int l, bad;
+
+ while (left > 0) {
+ l = min_t(int, left, PAGE_CACHE_SIZE-po);
+ bad = copy_to_user(data, page_address(pages[i]) + po, l);
+ if (bad == l)
+ return -EFAULT;
+ data += l - bad;
+ left -= l - bad;
+ if (po) {
+ po += l - bad;
+ if (po == PAGE_CACHE_SIZE)
+ po = 0;
+ }
+ i++;
+ }
+ return len;
+}
+EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
+
+/*
+ * Zero an extent within a page vector. Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+ int i = off >> PAGE_CACHE_SHIFT;
+
+ off &= ~PAGE_CACHE_MASK;
+
+ dout("zero_page_vector_page %u~%u\n", off, len);
+
+ /* leading partial page? */
+ if (off) {
+ int end = min((int)PAGE_CACHE_SIZE, off + len);
+ dout("zeroing %d %p head from %d\n", i, pages[i],
+ (int)off);
+ zero_user_segment(pages[i], off, end);
+ len -= (end - off);
+ i++;
+ }
+ while (len >= PAGE_CACHE_SIZE) {
+ dout("zeroing %d %p len=%d\n", i, pages[i], len);
+ zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+ len -= PAGE_CACHE_SIZE;
+ i++;
+ }
+ /* trailing partial page? */
+ if (len) {
+ dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+ zero_user_segment(pages[i], 0, len);
+ }
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 251997a9548..cd1e039c875 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
unlock_sock_fast(sk, slow);
/* skb is now orphaned, can be freed outside of locked section */
+ trace_kfree_skb(skb, skb_free_datagram_locked);
__kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);
@@ -746,13 +747,12 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR;
if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLRDHUP;
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sk->sk_shutdown & RCV_SHUTDOWN))
+ if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
/* Connection-based need to check for termination and startup */
diff --git a/net/core/dev.c b/net/core/dev.c
index 1ae65439144..78b5a89b0f4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,7 +128,10 @@
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
#include <linux/pci.h>
+#include <linux/inetdevice.h>
#include "net-sysfs.h"
@@ -371,6 +374,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
* --ANK (980803)
*/
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+ if (pt->type == htons(ETH_P_ALL))
+ return &ptype_all;
+ else
+ return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
/**
* dev_add_pack - add packet handler
* @pt: packet type declaration
@@ -386,16 +397,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
void dev_add_pack(struct packet_type *pt)
{
- int hash;
+ struct list_head *head = ptype_head(pt);
- spin_lock_bh(&ptype_lock);
- if (pt->type == htons(ETH_P_ALL))
- list_add_rcu(&pt->list, &ptype_all);
- else {
- hash = ntohs(pt->type) & PTYPE_HASH_MASK;
- list_add_rcu(&pt->list, &ptype_base[hash]);
- }
- spin_unlock_bh(&ptype_lock);
+ spin_lock(&ptype_lock);
+ list_add_rcu(&pt->list, head);
+ spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
@@ -414,15 +420,10 @@ EXPORT_SYMBOL(dev_add_pack);
*/
void __dev_remove_pack(struct packet_type *pt)
{
- struct list_head *head;
+ struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
- spin_lock_bh(&ptype_lock);
-
- if (pt->type == htons(ETH_P_ALL))
- head = &ptype_all;
- else
- head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+ spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
if (pt == pt1) {
@@ -433,7 +434,7 @@ void __dev_remove_pack(struct packet_type *pt)
printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
- spin_unlock_bh(&ptype_lock);
+ spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
@@ -1484,8 +1485,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
skb_orphan(skb);
nf_reset(skb);
- if (!(dev->flags & IFF_UP) ||
- (skb->len > (dev->mtu + dev->hard_header_len))) {
+ if (unlikely(!(dev->flags & IFF_UP) ||
+ (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
+ atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -1553,21 +1555,56 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
*/
-void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
- unsigned int real_num = dev->real_num_tx_queues;
+ if (txq < 1 || txq > dev->num_tx_queues)
+ return -EINVAL;
- if (unlikely(txq > dev->num_tx_queues))
- ;
- else if (txq > real_num)
- dev->real_num_tx_queues = txq;
- else if (txq < real_num) {
- dev->real_num_tx_queues = txq;
- qdisc_reset_all_tx_gt(dev, txq);
+ if (dev->reg_state == NETREG_REGISTERED) {
+ ASSERT_RTNL();
+
+ if (txq < dev->real_num_tx_queues)
+ qdisc_reset_all_tx_gt(dev, txq);
}
+
+ dev->real_num_tx_queues = txq;
+ return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
+#ifdef CONFIG_RPS
+/**
+ * netif_set_real_num_rx_queues - set actual number of RX queues used
+ * @dev: Network device
+ * @rxq: Actual number of RX queues
+ *
+ * This must be called either with the rtnl_lock held or before
+ * registration of the net device. Returns 0 on success, or a
+ * negative error code. If called before registration, it always
+ * succeeds.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+ int rc;
+
+ if (rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ if (dev->reg_state == NETREG_REGISTERED) {
+ ASSERT_RTNL();
+
+ rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+ rxq);
+ if (rc)
+ return rc;
+ }
+
+ dev->real_num_rx_queues = rxq;
+ return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
static inline void __netif_reschedule(struct Qdisc *q)
{
struct softnet_data *sd;
@@ -1659,7 +1696,12 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
- if (can_checksum_protocol(dev->features, skb->protocol))
+ int features = dev->features;
+
+ if (vlan_tx_tag_present(skb))
+ features &= dev->vlan_features;
+
+ if (can_checksum_protocol(features, skb->protocol))
return true;
if (skb->protocol == htons(ETH_P_8021Q)) {
@@ -1758,6 +1800,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
__be16 type = skb->protocol;
int err;
+ if (type == htons(ETH_P_8021Q)) {
+ struct vlan_ethhdr *veh;
+
+ if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+ return ERR_PTR(-EINVAL);
+
+ veh = (struct vlan_ethhdr *)skb->data;
+ type = veh->h_vlan_encapsulated_proto;
+ }
+
skb_reset_mac_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
__skb_pull(skb, skb->mac_len);
@@ -1902,14 +1954,14 @@ static int dev_gso_segment(struct sk_buff *skb)
/*
* Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested, since
- * drivers need to call skb_tstamp_tx() to send the timestamp.
+ * We cannot orphan skb if tx timestamp is requested or the sk-reference
+ * is needed on driver level for other reasons, e.g. see net/can/raw.c
*/
static inline void skb_orphan_try(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- if (sk && !skb_tx(skb)->flags) {
+ if (sk && !skb_shinfo(skb)->tx_flags) {
/* skb_tx_hash() wont be able to get sk.
* We copy sk_hash into skb->rxhash
*/
@@ -1929,9 +1981,14 @@ static inline void skb_orphan_try(struct sk_buff *skb)
static inline int skb_needs_linearize(struct sk_buff *skb,
struct net_device *dev)
{
+ int features = dev->features;
+
+ if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
+ features &= dev->vlan_features;
+
return skb_is_nonlinear(skb) &&
- ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
- (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
+ ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
+ (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
illegal_highdma(dev, skb))));
}
@@ -1954,6 +2011,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
skb_orphan_try(skb);
+ if (vlan_tx_tag_present(skb) &&
+ !(dev->features & NETIF_F_HW_VLAN_TX)) {
+ skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+ if (unlikely(!skb))
+ goto out;
+
+ skb->vlan_tci = 0;
+ }
+
if (netif_needs_gso(dev, skb)) {
if (unlikely(dev_gso_segment(skb)))
goto out_kfree_skb;
@@ -1978,6 +2044,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
}
rc = ops->ndo_start_xmit(skb, dev);
+ trace_net_dev_xmit(skb, rc);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
return rc;
@@ -1998,6 +2065,7 @@ gso:
skb_dst_drop(nskb);
rc = ops->ndo_start_xmit(nskb, dev);
+ trace_net_dev_xmit(nskb, rc);
if (unlikely(rc != NETDEV_TX_OK)) {
if (rc & ~NETDEV_TX_MASK)
goto out_kfree_gso_skb;
@@ -2015,6 +2083,7 @@ out_kfree_gso_skb:
skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
kfree_skb(skb);
+out:
return rc;
}
@@ -2058,16 +2127,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sk_buff *skb)
{
int queue_index;
- struct sock *sk = skb->sk;
+ const struct net_device_ops *ops = dev->netdev_ops;
- queue_index = sk_tx_queue_get(sk);
- if (queue_index < 0) {
- const struct net_device_ops *ops = dev->netdev_ops;
+ if (ops->ndo_select_queue) {
+ queue_index = ops->ndo_select_queue(dev, skb);
+ queue_index = dev_cap_txqueue(dev, queue_index);
+ } else {
+ struct sock *sk = skb->sk;
+ queue_index = sk_tx_queue_get(sk);
+ if (queue_index < 0) {
- if (ops->ndo_select_queue) {
- queue_index = ops->ndo_select_queue(dev, skb);
- queue_index = dev_cap_txqueue(dev, queue_index);
- } else {
queue_index = 0;
if (dev->real_num_tx_queues > 1)
queue_index = skb_tx_hash(dev, skb);
@@ -2143,6 +2212,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
return rc;
}
+static DEFINE_PER_CPU(int, xmit_recursion);
+#define RECURSION_LIMIT 3
+
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
@@ -2186,6 +2258,7 @@ int dev_queue_xmit(struct sk_buff *skb)
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
+ trace_net_dev_queue(skb);
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
@@ -2208,10 +2281,15 @@ int dev_queue_xmit(struct sk_buff *skb)
if (txq->xmit_lock_owner != cpu) {
+ if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+ goto recursion_alert;
+
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_tx_queue_stopped(txq)) {
+ __this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq);
+ __this_cpu_dec(xmit_recursion);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
@@ -2223,7 +2301,9 @@ int dev_queue_xmit(struct sk_buff *skb)
"queue packet!\n", dev->name);
} else {
/* Recursion is detected! It is possible,
- * unfortunately */
+ * unfortunately
+ */
+recursion_alert:
if (net_ratelimit())
printk(KERN_CRIT "Dead loop on virtual device "
"%s, fix it urgently!\n", dev->name);
@@ -2259,69 +2339,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
/*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Returns a non-zero hash number on success
+ * and 0 on failure.
*/
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow **rflowp)
+__u32 __skb_get_rxhash(struct sk_buff *skb)
{
+ int nhoff, hash = 0, poff;
struct ipv6hdr *ip6;
struct iphdr *ip;
- struct netdev_rx_queue *rxqueue;
- struct rps_map *map;
- struct rps_dev_flow_table *flow_table;
- struct rps_sock_flow_table *sock_flow_table;
- int cpu = -1;
u8 ip_proto;
- u16 tcpu;
u32 addr1, addr2, ihl;
union {
u32 v32;
u16 v16[2];
} ports;
- if (skb_rx_queue_recorded(skb)) {
- u16 index = skb_get_rx_queue(skb);
- if (unlikely(index >= dev->num_rx_queues)) {
- WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
- "on queue %u, but number of RX queues is %u\n",
- dev->name, index, dev->num_rx_queues);
- goto done;
- }
- rxqueue = dev->_rx + index;
- } else
- rxqueue = dev->_rx;
-
- if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
- goto done;
-
- if (skb->rxhash)
- goto got_hash; /* Skip hash computation on packet header */
+ nhoff = skb_network_offset(skb);
switch (skb->protocol) {
case __constant_htons(ETH_P_IP):
- if (!pskb_may_pull(skb, sizeof(*ip)))
+ if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
goto done;
- ip = (struct iphdr *) skb->data;
- ip_proto = ip->protocol;
+ ip = (struct iphdr *) (skb->data + nhoff);
+ if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ ip_proto = 0;
+ else
+ ip_proto = ip->protocol;
addr1 = (__force u32) ip->saddr;
addr2 = (__force u32) ip->daddr;
ihl = ip->ihl;
break;
case __constant_htons(ETH_P_IPV6):
- if (!pskb_may_pull(skb, sizeof(*ip6)))
+ if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
goto done;
- ip6 = (struct ipv6hdr *) skb->data;
+ ip6 = (struct ipv6hdr *) (skb->data + nhoff);
ip_proto = ip6->nexthdr;
addr1 = (__force u32) ip6->saddr.s6_addr32[3];
addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2330,33 +2385,81 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
default:
goto done;
}
- switch (ip_proto) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_DCCP:
- case IPPROTO_ESP:
- case IPPROTO_AH:
- case IPPROTO_SCTP:
- case IPPROTO_UDPLITE:
- if (pskb_may_pull(skb, (ihl * 4) + 4)) {
- ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+
+ ports.v32 = 0;
+ poff = proto_ports_offset(ip_proto);
+ if (poff >= 0) {
+ nhoff += ihl * 4 + poff;
+ if (pskb_may_pull(skb, nhoff + 4)) {
+ ports.v32 = * (__force u32 *) (skb->data + nhoff);
if (ports.v16[1] < ports.v16[0])
swap(ports.v16[0], ports.v16[1]);
- break;
}
- default:
- ports.v32 = 0;
- break;
}
/* get a consistent hash (same value on both flow directions) */
if (addr2 < addr1)
swap(addr1, addr2);
- skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
- if (!skb->rxhash)
- skb->rxhash = 1;
-got_hash:
+ hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+ if (!hash)
+ hash = 1;
+
+done:
+ return hash;
+}
+EXPORT_SYMBOL(__skb_get_rxhash);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
+{
+ struct netdev_rx_queue *rxqueue;
+ struct rps_map *map = NULL;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_sock_flow_table *sock_flow_table;
+ int cpu = -1;
+ u16 tcpu;
+
+ if (skb_rx_queue_recorded(skb)) {
+ u16 index = skb_get_rx_queue(skb);
+ if (unlikely(index >= dev->real_num_rx_queues)) {
+ WARN_ONCE(dev->real_num_rx_queues > 1,
+ "%s received packet on queue %u, but number "
+ "of RX queues is %u\n",
+ dev->name, index, dev->real_num_rx_queues);
+ goto done;
+ }
+ rxqueue = dev->_rx + index;
+ } else
+ rxqueue = dev->_rx;
+
+ if (rxqueue->rps_map) {
+ map = rcu_dereference(rxqueue->rps_map);
+ if (map && map->len == 1) {
+ tcpu = map->cpus[0];
+ if (cpu_online(tcpu))
+ cpu = tcpu;
+ goto done;
+ }
+ } else if (!rxqueue->rps_flow_table) {
+ goto done;
+ }
+
+ skb_reset_network_header(skb);
+ if (!skb_get_rxhash(skb))
+ goto done;
+
flow_table = rcu_dereference(rxqueue->rps_flow_table);
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
@@ -2396,7 +2499,6 @@ got_hash:
}
}
- map = rcu_dereference(rxqueue->rps_map);
if (map) {
tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
@@ -2482,6 +2584,7 @@ enqueue:
local_irq_restore(flags);
+ atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -2512,6 +2615,7 @@ int netif_rx(struct sk_buff *skb)
if (netdev_tstamp_prequeue)
net_timestamp_check(skb);
+ trace_netif_rx(skb);
#ifdef CONFIG_RPS
{
struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2675,7 @@ static void net_tx_action(struct softirq_action *h)
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
+ trace_kfree_skb(skb, net_tx_action);
__kfree_skb(skb);
}
}
@@ -2636,11 +2741,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
* the ingress scheduler, you just cant add policies on ingress.
*
*/
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
{
struct net_device *dev = skb->dev;
u32 ttl = G_TC_RTTL(skb->tc_verd);
- struct netdev_queue *rxq;
int result = TC_ACT_OK;
struct Qdisc *q;
@@ -2654,8 +2758,6 @@ static int ing_filter(struct sk_buff *skb)
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
- rxq = &dev->rx_queue;
-
q = rxq->qdisc;
if (q != &noop_qdisc) {
spin_lock(qdisc_lock(q));
@@ -2671,7 +2773,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
- if (skb->dev->rx_queue.qdisc == &noop_qdisc)
+ struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+ if (!rxq || rxq->qdisc == &noop_qdisc)
goto out;
if (*pt_prev) {
@@ -2679,7 +2783,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
*pt_prev = NULL;
}
- switch (ing_filter(skb)) {
+ switch (ing_filter(skb, rxq)) {
case TC_ACT_SHOT:
case TC_ACT_STOLEN:
kfree_skb(skb);
@@ -2692,33 +2796,6 @@ out:
}
#endif
-/*
- * netif_nit_deliver - deliver received packets to network taps
- * @skb: buffer
- *
- * This function is used to deliver incoming packets to network
- * taps. It should be used when the normal netif_receive_skb path
- * is bypassed, for example because of VLAN acceleration.
- */
-void netif_nit_deliver(struct sk_buff *skb)
-{
- struct packet_type *ptype;
-
- if (list_empty(&ptype_all))
- return;
-
- skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
- skb->mac_len = skb->network_header - skb->mac_header;
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (!ptype->dev || ptype->dev == skb->dev)
- deliver_skb(skb, ptype, skb->dev);
- }
- rcu_read_unlock();
-}
-
/**
* netdev_rx_handler_register - register receive handler
* @dev: device to register a handler for
@@ -2828,8 +2905,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
if (!netdev_tstamp_prequeue)
net_timestamp_check(skb);
- if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
- return NET_RX_SUCCESS;
+ trace_netif_receive_skb(skb);
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
@@ -2843,8 +2919,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
* be delivered to pkt handlers that are exact matches. Also
* the deliver_no_wcard flag will be set. If packet handlers
* are sensitive to duplicate packets these skbs will need to
- * be dropped at the handler. The vlan accel path may have
- * already set the deliver_no_wcard flag.
+ * be dropped at the handler.
*/
null_or_orig = NULL;
orig_dev = skb->dev;
@@ -2903,6 +2978,18 @@ ncls:
goto out;
}
+ if (vlan_tx_tag_present(skb)) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ if (vlan_hwaccel_do_receive(&skb)) {
+ ret = __netif_receive_skb(skb);
+ goto out;
+ } else if (unlikely(!skb))
+ goto out;
+ }
+
/*
* Make sure frames received on VLAN interfaces stacked on
* bonding interfaces still make their way to any base bonding
@@ -2930,6 +3017,7 @@ ncls:
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
+ atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
@@ -3050,7 +3138,7 @@ out:
return netif_receive_skb(skb);
}
-static void napi_gro_flush(struct napi_struct *napi)
+inline void napi_gro_flush(struct napi_struct *napi)
{
struct sk_buff *skb, *next;
@@ -3063,6 +3151,7 @@ static void napi_gro_flush(struct napi_struct *napi)
napi->gro_count = 0;
napi->gro_list = NULL;
}
+EXPORT_SYMBOL(napi_gro_flush);
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
@@ -3077,7 +3166,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
goto normal;
- if (skb_is_gso(skb) || skb_has_frags(skb))
+ if (skb_is_gso(skb) || skb_has_frag_list(skb))
goto normal;
rcu_read_lock();
@@ -3143,7 +3232,7 @@ pull:
put_page(skb_shinfo(skb)->frags[0].page);
memmove(skb_shinfo(skb)->frags,
skb_shinfo(skb)->frags + 1,
- --skb_shinfo(skb)->nr_frags);
+ --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
}
}
@@ -3156,16 +3245,19 @@ normal:
}
EXPORT_SYMBOL(dev_gro_receive);
-static gro_result_t
+static inline gro_result_t
__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff *p;
for (p = napi->gro_list; p; p = p->next) {
- NAPI_GRO_CB(p)->same_flow =
- (p->dev == skb->dev) &&
- !compare_ether_header(skb_mac_header(p),
+ unsigned long diffs;
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= compare_ether_header(skb_mac_header(p),
skb_gro_mac_header(skb));
+ NAPI_GRO_CB(p)->same_flow = !diffs;
NAPI_GRO_CB(p)->flush = 0;
}
@@ -3218,14 +3310,14 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
}
EXPORT_SYMBOL(napi_gro_receive);
-void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
__skb_pull(skb, skb_headlen(skb));
skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
+ skb->vlan_tci = 0;
napi->skb = skb;
}
-EXPORT_SYMBOL(napi_reuse_skb);
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
@@ -4845,7 +4937,7 @@ static void rollback_registered_many(struct list_head *head)
dev = list_first_entry(head, struct net_device, unreg_list);
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
- synchronize_net();
+ rcu_barrier();
list_for_each_entry(dev, head, unreg_list)
dev_put(dev);
@@ -4859,21 +4951,6 @@ static void rollback_registered(struct net_device *dev)
rollback_registered_many(&single);
}
-static void __netdev_init_queue_locks_one(struct net_device *dev,
- struct netdev_queue *dev_queue,
- void *_unused)
-{
- spin_lock_init(&dev_queue->_xmit_lock);
- netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
- dev_queue->xmit_lock_owner = -1;
-}
-
-static void netdev_init_queue_locks(struct net_device *dev)
-{
- netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
- __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
-}
-
unsigned long netdev_fix_features(unsigned long features, const char *name)
{
/* Fix illegal SG+CSUM combinations. */
@@ -4941,6 +5018,66 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+#ifdef CONFIG_RPS
+ unsigned int i, count = dev->num_rx_queues;
+ struct netdev_rx_queue *rx;
+
+ BUG_ON(count < 1);
+
+ rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+ if (!rx) {
+ pr_err("netdev: Unable to allocate %u rx queues.\n", count);
+ return -ENOMEM;
+ }
+ dev->_rx = rx;
+
+ /*
+ * Set a pointer to first element in the array which holds the
+ * reference count.
+ */
+ for (i = 0; i < count; i++)
+ rx[i].first = rx;
+#endif
+ return 0;
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+ unsigned int count = dev->num_tx_queues;
+ struct netdev_queue *tx;
+
+ BUG_ON(count < 1);
+
+ tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
+ if (!tx) {
+ pr_err("netdev: Unable to allocate %u tx queues.\n",
+ count);
+ return -ENOMEM;
+ }
+ dev->_tx = tx;
+ return 0;
+}
+
+static void netdev_init_one_queue(struct net_device *dev,
+ struct netdev_queue *queue,
+ void *_unused)
+{
+ queue->dev = dev;
+
+ /* Initialize queue lock */
+ spin_lock_init(&queue->_xmit_lock);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+ queue->xmit_lock_owner = -1;
+}
+
+static void netdev_init_queues(struct net_device *dev)
+{
+ netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+ spin_lock_init(&dev->tx_global_lock);
+}
+
/**
* register_netdevice - register a network device
* @dev: device to register
@@ -4974,28 +5111,19 @@ int register_netdevice(struct net_device *dev)
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
- netdev_init_queue_locks(dev);
dev->iflink = -1;
-#ifdef CONFIG_RPS
- if (!dev->num_rx_queues) {
- /*
- * Allocate a single RX queue if driver never called
- * alloc_netdev_mq
- */
+ ret = netif_alloc_rx_queues(dev);
+ if (ret)
+ goto out;
- dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
- if (!dev->_rx) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = netif_alloc_netdev_queues(dev);
+ if (ret)
+ goto out;
+
+ netdev_init_queues(dev);
- dev->_rx->first = dev->_rx;
- atomic_set(&dev->_rx->count, 1);
- dev->num_rx_queues = 1;
- }
-#endif
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
@@ -5035,6 +5163,12 @@ int register_netdevice(struct net_device *dev)
if (dev->features & NETIF_F_SG)
dev->features |= NETIF_F_GSO;
+ /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
+ * vlan_dev_init() will do the dev->features check, so these features
+ * are enabled only if supported by underlying device.
+ */
+ dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
+
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
@@ -5105,9 +5239,6 @@ int init_dummy_netdev(struct net_device *dev)
*/
dev->reg_state = NETREG_DUMMY;
- /* initialize the ref count */
- atomic_set(&dev->refcnt, 1);
-
/* NAPI wants this */
INIT_LIST_HEAD(&dev->napi_list);
@@ -5115,6 +5246,11 @@ int init_dummy_netdev(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
+ /* Note : We dont allocate pcpu_refcnt for dummy devices,
+ * because users of this 'device' dont need to change
+ * its refcount.
+ */
+
return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5156,6 +5292,16 @@ out:
}
EXPORT_SYMBOL(register_netdev);
+int netdev_refcnt_read(const struct net_device *dev)
+{
+ int i, refcnt = 0;
+
+ for_each_possible_cpu(i)
+ refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+ return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
/*
* netdev_wait_allrefs - wait until all references are gone.
*
@@ -5170,11 +5316,14 @@ EXPORT_SYMBOL(register_netdev);
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
+ int refcnt;
linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
- while (atomic_read(&dev->refcnt) != 0) {
+ refcnt = netdev_refcnt_read(dev);
+
+ while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
@@ -5201,11 +5350,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
msleep(250);
+ refcnt = netdev_refcnt_read(dev);
+
if (time_after(jiffies, warning_time + 10 * HZ)) {
printk(KERN_EMERG "unregister_netdevice: "
"waiting for %s to become free. Usage "
"count = %d\n",
- dev->name, atomic_read(&dev->refcnt));
+ dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -5263,8 +5414,8 @@ void netdev_run_todo(void)
netdev_wait_allrefs(dev);
/* paranoia */
- BUG_ON(atomic_read(&dev->refcnt));
- WARN_ON(dev->ip_ptr);
+ BUG_ON(netdev_refcnt_read(dev));
+ WARN_ON(rcu_dereference_raw(dev->ip_ptr));
WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr);
@@ -5342,30 +5493,34 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
- return ops->ndo_get_stats64(dev, storage);
- }
- if (ops->ndo_get_stats) {
+ ops->ndo_get_stats64(dev, storage);
+ } else if (ops->ndo_get_stats) {
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
- return storage;
+ } else {
+ netdev_stats_to_stats64(storage, &dev->stats);
+ dev_txq_stats_fold(dev, storage);
}
- netdev_stats_to_stats64(storage, &dev->stats);
- dev_txq_stats_fold(dev, storage);
+ storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
-static void netdev_init_one_queue(struct net_device *dev,
- struct netdev_queue *queue,
- void *_unused)
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
- queue->dev = dev;
-}
+ struct netdev_queue *queue = dev_ingress_queue(dev);
-static void netdev_init_queues(struct net_device *dev)
-{
- netdev_init_one_queue(dev, &dev->rx_queue, NULL);
- netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
- spin_lock_init(&dev->tx_global_lock);
+#ifdef CONFIG_NET_CLS_ACT
+ if (queue)
+ return queue;
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+ if (!queue)
+ return NULL;
+ netdev_init_one_queue(dev, queue, NULL);
+ queue->qdisc = &noop_qdisc;
+ queue->qdisc_sleeping = &noop_qdisc;
+ rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+ return queue;
}
/**
@@ -5382,17 +5537,18 @@ static void netdev_init_queues(struct net_device *dev)
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
void (*setup)(struct net_device *), unsigned int queue_count)
{
- struct netdev_queue *tx;
struct net_device *dev;
size_t alloc_size;
struct net_device *p;
-#ifdef CONFIG_RPS
- struct netdev_rx_queue *rx;
- int i;
-#endif
BUG_ON(strlen(name) >= sizeof(dev->name));
+ if (queue_count < 1) {
+ pr_err("alloc_netdev: Unable to allocate device "
+ "with zero queues.\n");
+ return NULL;
+ }
+
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
@@ -5408,55 +5564,31 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
return NULL;
}
- tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx) {
- printk(KERN_ERR "alloc_netdev: Unable to allocate "
- "tx qdiscs.\n");
- goto free_p;
- }
-
-#ifdef CONFIG_RPS
- rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
- if (!rx) {
- printk(KERN_ERR "alloc_netdev: Unable to allocate "
- "rx queues.\n");
- goto free_tx;
- }
-
- atomic_set(&rx->count, queue_count);
-
- /*
- * Set a pointer to first element in the array which holds the
- * reference count.
- */
- for (i = 0; i < queue_count; i++)
- rx[i].first = rx;
-#endif
-
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
+ dev->pcpu_refcnt = alloc_percpu(int);
+ if (!dev->pcpu_refcnt)
+ goto free_p;
+
if (dev_addr_init(dev))
- goto free_rx;
+ goto free_pcpu;
dev_mc_init(dev);
dev_uc_init(dev);
dev_net_set(dev, &init_net);
- dev->_tx = tx;
dev->num_tx_queues = queue_count;
dev->real_num_tx_queues = queue_count;
#ifdef CONFIG_RPS
- dev->_rx = rx;
dev->num_rx_queues = queue_count;
+ dev->real_num_rx_queues = queue_count;
#endif
dev->gso_max_size = GSO_MAX_SIZE;
- netdev_init_queues(dev);
-
INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
dev->ethtool_ntuple_list.count = 0;
INIT_LIST_HEAD(&dev->napi_list);
@@ -5467,12 +5599,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
strcpy(dev->name, name);
return dev;
-free_rx:
-#ifdef CONFIG_RPS
- kfree(rx);
-free_tx:
-#endif
- kfree(tx);
+free_pcpu:
+ free_percpu(dev->pcpu_refcnt);
free_p:
kfree(p);
return NULL;
@@ -5495,6 +5623,8 @@ void free_netdev(struct net_device *dev)
kfree(dev->_tx);
+ kfree(rcu_dereference_raw(dev->ingress_queue));
+
/* Flush device addresses */
dev_addr_flush(dev);
@@ -5504,6 +5634,9 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ free_percpu(dev->pcpu_refcnt);
+ dev->pcpu_refcnt = NULL;
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded);
@@ -5658,6 +5791,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* Notify protocols, that we are about to destroy
this device. They should clean all the things.
+
+ Note that dev->reg_state stays at NETREG_REGISTERED.
+ This is wanted because this way 8021q and macvlan know
+ the device is just moving and can keep their slaves up.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
diff --git a/net/core/dst.c b/net/core/dst.c
index 6c41b1fac3d..8abe628b79f 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -168,7 +168,7 @@ void *dst_alloc(struct dst_ops *ops)
{
struct dst_entry *dst;
- if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
+ if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
if (ops->gc(ops))
return NULL;
}
@@ -183,7 +183,7 @@ void *dst_alloc(struct dst_ops *ops)
#if RT_CACHE_DEBUG >= 2
atomic_inc(&dst_total);
#endif
- atomic_inc(&ops->entries);
+ dst_entries_add(ops, 1);
return dst;
}
EXPORT_SYMBOL(dst_alloc);
@@ -228,15 +228,15 @@ again:
child = dst->child;
dst->hh = NULL;
- if (hh && atomic_dec_and_test(&hh->hh_refcnt))
- kfree(hh);
+ if (hh)
+ hh_cache_put(hh);
if (neigh) {
dst->neighbour = NULL;
neigh_release(neigh);
}
- atomic_dec(&dst->ops->entries);
+ dst_entries_add(dst->ops, -1);
if (dst->ops->destroy)
dst->ops->destroy(dst);
@@ -271,13 +271,40 @@ void dst_release(struct dst_entry *dst)
if (dst) {
int newrefcnt;
- smp_mb__before_atomic_dec();
newrefcnt = atomic_dec_return(&dst->__refcnt);
WARN_ON(newrefcnt < 0);
+ if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
+ dst = dst_destroy(dst);
+ if (dst)
+ __dst_free(dst);
+ }
}
}
EXPORT_SYMBOL(dst_release);
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+ WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ /* If dst not in cache, we must take a reference, because
+ * dst_release() will destroy dst as soon as its refcount becomes zero
+ */
+ if (unlikely(dst->flags & DST_NOCACHE)) {
+ dst_hold(dst);
+ skb_dst_set(skb, dst);
+ } else {
+ skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
+ }
+}
+EXPORT_SYMBOL(skb_dst_set_noref);
+
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 7a85367b3c2..956a9f4971c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -19,6 +19,7 @@
#include <linux/netdevice.h>
#include <linux/bitops.h>
#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
#include <linux/slab.h>
/*
@@ -131,7 +132,8 @@ EXPORT_SYMBOL(ethtool_op_set_ufo);
* NETIF_F_xxx values in include/linux/netdevice.h
*/
static const u32 flags_dup_features =
- (ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH);
+ (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
+ ETH_FLAG_RXHASH);
u32 ethtool_op_get_flags(struct net_device *dev)
{
@@ -205,18 +207,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo info;
const struct ethtool_ops *ops = dev->ethtool_ops;
- if (!ops->get_drvinfo)
- return -EOPNOTSUPP;
-
memset(&info, 0, sizeof(info));
info.cmd = ETHTOOL_GDRVINFO;
- ops->get_drvinfo(dev, &info);
+ if (ops && ops->get_drvinfo) {
+ ops->get_drvinfo(dev, &info);
+ } else if (dev->dev.parent && dev->dev.parent->driver) {
+ strlcpy(info.bus_info, dev_name(dev->dev.parent),
+ sizeof(info.bus_info));
+ strlcpy(info.driver, dev->dev.parent->driver->name,
+ sizeof(info.driver));
+ } else {
+ return -EOPNOTSUPP;
+ }
/*
* this method of obtaining string set info is deprecated;
* Use ETHTOOL_GSSET_INFO instead.
*/
- if (ops->get_sset_count) {
+ if (ops && ops->get_sset_count) {
int rc;
rc = ops->get_sset_count(dev, ETH_SS_TEST);
@@ -229,9 +237,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
if (rc >= 0)
info.n_priv_flags = rc;
}
- if (ops->get_regs_len)
+ if (ops && ops->get_regs_len)
info.regdump_len = ops->get_regs_len(dev);
- if (ops->get_eeprom_len)
+ if (ops && ops->get_eeprom_len)
info.eedump_len = ops->get_eeprom_len(dev);
if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -348,7 +356,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
if (info.cmd == ETHTOOL_GRXCLSRLALL) {
if (info.rule_cnt > 0) {
if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
- rule_buf = kmalloc(info.rule_cnt * sizeof(u32),
+ rule_buf = kzalloc(info.rule_cnt * sizeof(u32),
GFP_USER);
if (!rule_buf)
return -ENOMEM;
@@ -397,7 +405,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
(KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
return -ENOMEM;
full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
- indir = kmalloc(full_size, GFP_USER);
+ indir = kzalloc(full_size, GFP_USER);
if (!indir)
return -ENOMEM;
@@ -479,6 +487,38 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
list->count++;
}
+/*
+ * ethtool does not (or did not) set masks for flow parameters that are
+ * not specified, so if both value and mask are 0 then this must be
+ * treated as equivalent to a mask with all bits set. Implement that
+ * here rather than in drivers.
+ */
+static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
+{
+ struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
+ struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
+
+ if (fs->flow_type != TCP_V4_FLOW &&
+ fs->flow_type != UDP_V4_FLOW &&
+ fs->flow_type != SCTP_V4_FLOW)
+ return;
+
+ if (!(entry->ip4src | mask->ip4src))
+ mask->ip4src = htonl(0xffffffff);
+ if (!(entry->ip4dst | mask->ip4dst))
+ mask->ip4dst = htonl(0xffffffff);
+ if (!(entry->psrc | mask->psrc))
+ mask->psrc = htons(0xffff);
+ if (!(entry->pdst | mask->pdst))
+ mask->pdst = htons(0xffff);
+ if (!(entry->tos | mask->tos))
+ mask->tos = 0xff;
+ if (!(fs->vlan_tag | fs->vlan_tag_mask))
+ fs->vlan_tag_mask = 0xffff;
+ if (!(fs->data | fs->data_mask))
+ fs->data_mask = 0xffffffffffffffffULL;
+}
+
static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
void __user *useraddr)
{
@@ -493,6 +533,8 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
+ rx_ntuple_fix_masks(&cmd.fs);
+
/*
* Cache filter in dev struct for GET operation only if
* the underlying driver doesn't have its own GET operation, and
@@ -538,7 +580,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
gstrings.len = ret;
- data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+ data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
if (!data)
return -ENOMEM;
@@ -667,19 +709,19 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
break;
case IP_USER_FLOW:
sprintf(p, "\tSrc IP addr: 0x%x\n",
- fsc->fs.h_u.raw_ip4_spec.ip4src);
+ fsc->fs.h_u.usr_ip4_spec.ip4src);
p += ETH_GSTRING_LEN;
num_strings++;
sprintf(p, "\tSrc IP mask: 0x%x\n",
- fsc->fs.m_u.raw_ip4_spec.ip4src);
+ fsc->fs.m_u.usr_ip4_spec.ip4src);
p += ETH_GSTRING_LEN;
num_strings++;
sprintf(p, "\tDest IP addr: 0x%x\n",
- fsc->fs.h_u.raw_ip4_spec.ip4dst);
+ fsc->fs.h_u.usr_ip4_spec.ip4dst);
p += ETH_GSTRING_LEN;
num_strings++;
sprintf(p, "\tDest IP mask: 0x%x\n",
- fsc->fs.m_u.raw_ip4_spec.ip4dst);
+ fsc->fs.m_u.usr_ip4_spec.ip4dst);
p += ETH_GSTRING_LEN;
num_strings++;
break;
@@ -775,7 +817,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (regs.len > reglen)
regs.len = reglen;
- regbuf = kmalloc(reglen, GFP_USER);
+ regbuf = vmalloc(reglen);
if (!regbuf)
return -ENOMEM;
@@ -790,7 +832,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
ret = 0;
out:
- kfree(regbuf);
+ vfree(regbuf);
return ret;
}
@@ -1175,8 +1217,11 @@ static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
return -EFAULT;
if (edata.data) {
- if (!dev->ethtool_ops->get_rx_csum ||
- !dev->ethtool_ops->get_rx_csum(dev))
+ u32 rxcsum = dev->ethtool_ops->get_rx_csum ?
+ dev->ethtool_ops->get_rx_csum(dev) :
+ ethtool_op_get_rx_csum(dev);
+
+ if (!rxcsum)
return -EINVAL;
dev->features |= NETIF_F_GRO;
} else
@@ -1402,14 +1447,22 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
if (!dev || !netif_device_present(dev))
return -ENODEV;
- if (!dev->ethtool_ops)
- return -EOPNOTSUPP;
-
if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
+ if (!dev->ethtool_ops) {
+ /* ETHTOOL_GDRVINFO does not require any driver support.
+ * It is also unprivileged and does not change anything,
+ * so we can take a shortcut to it. */
+ if (ethcmd == ETHTOOL_GDRVINFO)
+ return ethtool_get_drvinfo(dev, useraddr);
+ else
+ return -EOPNOTSUPP;
+ }
+
/* Allow some commands to be done by anyone */
switch (ethcmd) {
+ case ETHTOOL_GSET:
case ETHTOOL_GDRVINFO:
case ETHTOOL_GMSGLVL:
case ETHTOOL_GCOALESCE:
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 42e84e08a1b..1bc3f253ba6 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -144,7 +144,7 @@ fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
}
EXPORT_SYMBOL_GPL(fib_rules_register);
-void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
{
struct fib_rule *rule, *tmp;
@@ -153,7 +153,6 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
fib_rule_put(rule);
}
}
-EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
static void fib_rules_put_rcu(struct rcu_head *head)
{
@@ -182,7 +181,8 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
{
int ret = 0;
- if (rule->iifindex && (rule->iifindex != fl->iif))
+ if (rule->iifindex && (rule->iifindex != fl->iif) &&
+ !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF))
goto out;
if (rule->oifindex && (rule->oifindex != fl->oif))
@@ -225,9 +225,12 @@ jumped:
err = ops->action(rule, fl, flags, arg);
if (err != -EAGAIN) {
- fib_rule_get(rule);
- arg->rule = rule;
- goto out;
+ if ((arg->flags & FIB_LOOKUP_NOREF) ||
+ likely(atomic_inc_not_zero(&rule->refcnt))) {
+ arg->rule = rule;
+ goto out;
+ }
+ break;
}
}
@@ -491,7 +494,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
}
}
- synchronize_rcu();
notify_rule_change(RTM_DELRULE, rule, ops, nlh,
NETLINK_CB(skb).pid);
fib_rule_put(rule);
diff --git a/net/core/filter.c b/net/core/filter.c
index 52b051f82a0..7adf5035291 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -638,10 +638,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
return err;
}
- rcu_read_lock_bh();
- old_fp = rcu_dereference_bh(sk->sk_filter);
+ old_fp = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
rcu_assign_pointer(sk->sk_filter, fp);
- rcu_read_unlock_bh();
if (old_fp)
sk_filter_delayed_uncharge(sk, old_fp);
@@ -654,14 +653,13 @@ int sk_detach_filter(struct sock *sk)
int ret = -ENOENT;
struct sk_filter *filter;
- rcu_read_lock_bh();
- filter = rcu_dereference_bh(sk->sk_filter);
+ filter = rcu_dereference_protected(sk->sk_filter,
+ sock_owned_by_user(sk));
if (filter) {
rcu_assign_pointer(sk->sk_filter, NULL);
sk_filter_delayed_uncharge(sk, filter);
ret = 0;
}
- rcu_read_unlock_bh();
return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
diff --git a/net/core/flow.c b/net/core/flow.c
index f67dcbfe54e..127c8a7ffd6 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -53,8 +53,7 @@ struct flow_flush_info {
struct flow_cache {
u32 hash_shift;
- unsigned long order;
- struct flow_cache_percpu *percpu;
+ struct flow_cache_percpu __percpu *percpu;
struct notifier_block hotcpu_notifier;
int low_watermark;
int high_watermark;
@@ -64,7 +63,7 @@ struct flow_cache {
atomic_t flow_cache_genid = ATOMIC_INIT(0);
EXPORT_SYMBOL(flow_cache_genid);
static struct flow_cache flow_cache_global;
-static struct kmem_cache *flow_cachep;
+static struct kmem_cache *flow_cachep __read_mostly;
static DEFINE_SPINLOCK(flow_cache_gc_lock);
static LIST_HEAD(flow_cache_gc_list);
@@ -177,15 +176,11 @@ static u32 flow_hash_code(struct flow_cache *fc,
{
u32 *k = (u32 *) key;
- return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
- & (flow_cache_hash_size(fc) - 1));
+ return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+ & (flow_cache_hash_size(fc) - 1);
}
-#if (BITS_PER_LONG == 64)
-typedef u64 flow_compare_t;
-#else
-typedef u32 flow_compare_t;
-#endif
+typedef unsigned long flow_compare_t;
/* I hear what you're saying, use memcmp. But memcmp cannot make
* important assumptions that we can here, such as alignment and
@@ -357,62 +352,73 @@ void flow_cache_flush(void)
put_online_cpus();
}
-static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
- struct flow_cache_percpu *fcp)
+static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
- fcp->hash_table = (struct hlist_head *)
- __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
- if (!fcp->hash_table)
- panic("NET: failed to allocate flow cache order %lu\n", fc->order);
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+ size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
- fcp->hash_rnd_recalc = 1;
- fcp->hash_count = 0;
- tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+ if (!fcp->hash_table) {
+ fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+ if (!fcp->hash_table) {
+ pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
+ return -ENOMEM;
+ }
+ fcp->hash_rnd_recalc = 1;
+ fcp->hash_count = 0;
+ tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+ }
+ return 0;
}
-static int flow_cache_cpu(struct notifier_block *nfb,
+static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
- int cpu = (unsigned long) hcpu;
+ int res, cpu = (unsigned long) hcpu;
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ res = flow_cache_cpu_prepare(fc, cpu);
+ if (res)
+ return notifier_from_errno(res);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
__flow_cache_shrink(fc, fcp, 0);
+ break;
+ }
return NOTIFY_OK;
}
-static int flow_cache_init(struct flow_cache *fc)
+static int __init flow_cache_init(struct flow_cache *fc)
{
- unsigned long order;
int i;
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
fc->high_watermark = 4 * flow_cache_hash_size(fc);
- for (order = 0;
- (PAGE_SIZE << order) <
- (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
- order++)
- /* NOTHING */;
- fc->order = order;
fc->percpu = alloc_percpu(struct flow_cache_percpu);
+ if (!fc->percpu)
+ return -ENOMEM;
- setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
- (unsigned long) fc);
- fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
- add_timer(&fc->rnd_timer);
-
- for_each_possible_cpu(i)
- flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
-
+ for_each_online_cpu(i) {
+ if (flow_cache_cpu_prepare(fc, i))
+ return -ENOMEM;
+ }
fc->hotcpu_notifier = (struct notifier_block){
.notifier_call = flow_cache_cpu,
};
register_hotcpu_notifier(&fc->hotcpu_notifier);
+ setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+ (unsigned long) fc);
+ fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+ add_timer(&fc->rnd_timer);
+
return 0;
}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9fbe7f7429b..7c2373321b7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -232,7 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
est->last_packets = bstats->packets;
est->avpps = rate_est->pps<<10;
- spin_lock(&est_tree_lock);
+ spin_lock_bh(&est_tree_lock);
if (!elist[idx].timer.function) {
INIT_LIST_HEAD(&elist[idx].list);
setup_timer(&elist[idx].timer, est_timer, idx);
@@ -243,7 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
list_add_rcu(&est->list, &elist[idx].list);
gen_add_node(est);
- spin_unlock(&est_tree_lock);
+ spin_unlock_bh(&est_tree_lock);
return 0;
}
@@ -270,18 +270,18 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
{
struct gen_estimator *e;
- spin_lock(&est_tree_lock);
+ spin_lock_bh(&est_tree_lock);
while ((e = gen_find_node(bstats, rate_est))) {
rb_erase(&e->node, &est_root);
- write_lock_bh(&est_lock);
+ write_lock(&est_lock);
e->bstats = NULL;
- write_unlock_bh(&est_lock);
+ write_unlock(&est_lock);
list_del_rcu(&e->list);
call_rcu(&e->e_rcu, __gen_kill_estimator);
}
- spin_unlock(&est_tree_lock);
+ spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
@@ -320,9 +320,9 @@ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
ASSERT_RTNL();
- spin_lock(&est_tree_lock);
+ spin_lock_bh(&est_tree_lock);
res = gen_find_node(bstats, rate_est) != NULL;
- spin_unlock(&est_tree_lock);
+ spin_unlock_bh(&est_tree_lock);
return res;
}
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 1cd98df412d..72aceb1fe4f 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -35,13 +35,16 @@
* in any case.
*/
-int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
+long verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
{
- int size, err, ct;
+ int size, ct;
+ long err;
if (m->msg_namelen) {
if (mode == VERIFY_READ) {
- err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
+ void __user *namep;
+ namep = (void __user __force *) m->msg_name;
+ err = move_addr_to_kernel(namep, m->msg_namelen,
address);
if (err < 0)
return err;
@@ -52,7 +55,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
}
size = m->msg_iovlen * sizeof(struct iovec);
- if (copy_from_user(iov, m->msg_iov, size))
+ if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
return -EFAULT;
m->msg_iov = iov;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a4e0a7482c2..8cc8f9a79db 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -122,7 +122,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return (base ? (net_random() % base) + (base >> 1) : 0);
+ return base ? (net_random() % base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
@@ -131,15 +131,20 @@ static int neigh_forced_gc(struct neigh_table *tbl)
{
int shrunk = 0;
int i;
+ struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock);
- for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np;
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (i = 0; i <= nht->hash_mask; i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
- np = &tbl->hash_buckets[i];
- while ((n = *np) != NULL) {
+ np = &nht->hash_buckets[i];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
/* Neighbour record may be discarded if:
* - nobody refers to it.
* - it is not permanent
@@ -147,7 +152,9 @@ static int neigh_forced_gc(struct neigh_table *tbl)
write_lock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
- *np = n->next;
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
n->dead = 1;
shrunk = 1;
write_unlock(&n->lock);
@@ -199,16 +206,24 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
{
int i;
+ struct neigh_hash_table *nht;
- for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np = &tbl->hash_buckets[i];
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
- while ((n = *np) != NULL) {
+ for (i = 0; i <= nht->hash_mask; i++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np = &nht->hash_buckets[i];
+
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
if (dev && n->dev != dev) {
np = &n->next;
continue;
}
- *np = n->next;
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
write_lock(&n->lock);
neigh_del_timer(n);
n->dead = 1;
@@ -279,6 +294,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
+ seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
@@ -297,64 +313,86 @@ out_entries:
goto out;
}
-static struct neighbour **neigh_hash_alloc(unsigned int entries)
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
{
- unsigned long size = entries * sizeof(struct neighbour *);
- struct neighbour **ret;
+ size_t size = entries * sizeof(struct neighbour *);
+ struct neigh_hash_table *ret;
+ struct neighbour **buckets;
- if (size <= PAGE_SIZE) {
- ret = kzalloc(size, GFP_ATOMIC);
- } else {
- ret = (struct neighbour **)
- __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size));
+ ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+ if (!ret)
+ return NULL;
+ if (size <= PAGE_SIZE)
+ buckets = kzalloc(size, GFP_ATOMIC);
+ else
+ buckets = (struct neighbour **)
+ __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+ get_order(size));
+ if (!buckets) {
+ kfree(ret);
+ return NULL;
}
+ rcu_assign_pointer(ret->hash_buckets, buckets);
+ ret->hash_mask = entries - 1;
+ get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
return ret;
}
-static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+static void neigh_hash_free_rcu(struct rcu_head *head)
{
- unsigned long size = entries * sizeof(struct neighbour *);
+ struct neigh_hash_table *nht = container_of(head,
+ struct neigh_hash_table,
+ rcu);
+ size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
+ struct neighbour **buckets = nht->hash_buckets;
if (size <= PAGE_SIZE)
- kfree(hash);
+ kfree(buckets);
else
- free_pages((unsigned long)hash, get_order(size));
+ free_pages((unsigned long)buckets, get_order(size));
+ kfree(nht);
}
-static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+ unsigned long new_entries)
{
- struct neighbour **new_hash, **old_hash;
- unsigned int i, new_hash_mask, old_entries;
+ unsigned int i, hash;
+ struct neigh_hash_table *new_nht, *old_nht;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
BUG_ON(!is_power_of_2(new_entries));
- new_hash = neigh_hash_alloc(new_entries);
- if (!new_hash)
- return;
-
- old_entries = tbl->hash_mask + 1;
- new_hash_mask = new_entries - 1;
- old_hash = tbl->hash_buckets;
+ old_nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ new_nht = neigh_hash_alloc(new_entries);
+ if (!new_nht)
+ return old_nht;
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
- for (i = 0; i < old_entries; i++) {
+ for (i = 0; i <= old_nht->hash_mask; i++) {
struct neighbour *n, *next;
- for (n = old_hash[i]; n; n = next) {
- unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
-
- hash_val &= new_hash_mask;
- next = n->next;
-
- n->next = new_hash[hash_val];
- new_hash[hash_val] = n;
+ for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
+ lockdep_is_held(&tbl->lock));
+ n != NULL;
+ n = next) {
+ hash = tbl->hash(n->primary_key, n->dev,
+ new_nht->hash_rnd);
+
+ hash &= new_nht->hash_mask;
+ next = rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock));
+
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(
+ new_nht->hash_buckets[hash],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(new_nht->hash_buckets[hash], n);
}
}
- tbl->hash_buckets = new_hash;
- tbl->hash_mask = new_hash_mask;
- neigh_hash_free(old_hash, old_entries);
+ rcu_assign_pointer(tbl->nht, new_nht);
+ call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+ return new_nht;
}
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
@@ -363,19 +401,26 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
struct neighbour *n;
int key_len = tbl->key_len;
u32 hash_val;
+ struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- hash_val = tbl->hash(pkey, dev);
- for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
- neigh_hold(n);
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
- read_unlock_bh(&tbl->lock);
+
+ rcu_read_unlock_bh();
return n;
}
EXPORT_SYMBOL(neigh_lookup);
@@ -386,20 +431,27 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
struct neighbour *n;
int key_len = tbl->key_len;
u32 hash_val;
+ struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- hash_val = tbl->hash(pkey, NULL);
- for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
+
+ for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
if (!memcmp(n->primary_key, pkey, key_len) &&
net_eq(dev_net(n->dev), net)) {
- neigh_hold(n);
+ if (!atomic_inc_not_zero(&n->refcnt))
+ n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
- read_unlock_bh(&tbl->lock);
+
+ rcu_read_unlock_bh();
return n;
}
EXPORT_SYMBOL(neigh_lookup_nodev);
@@ -411,6 +463,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
int key_len = tbl->key_len;
int error;
struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct neigh_hash_table *nht;
if (!n) {
rc = ERR_PTR(-ENOBUFS);
@@ -437,18 +490,24 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
- if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
- neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
+ if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
+ nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
- hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
+ hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
- for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
+ for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock));
+ n1 != NULL;
+ n1 = rcu_dereference_protected(n1->next,
+ lockdep_is_held(&tbl->lock))) {
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
neigh_hold(n1);
rc = n1;
@@ -456,10 +515,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
}
}
- n->next = tbl->hash_buckets[hash_val];
- tbl->hash_buckets[hash_val] = n;
n->dead = 0;
neigh_hold(n);
+ rcu_assign_pointer(n->next,
+ rcu_dereference_protected(nht->hash_buckets[hash_val],
+ lockdep_is_held(&tbl->lock)));
+ rcu_assign_pointer(nht->hash_buckets[hash_val], n);
write_unlock_bh(&tbl->lock);
NEIGH_PRINTK2("neigh %p is created.\n", n);
rc = n;
@@ -616,6 +677,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
neigh_parms_destroy(parms);
}
+static void neigh_destroy_rcu(struct rcu_head *head)
+{
+ struct neighbour *neigh = container_of(head, struct neighbour, rcu);
+
+ kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+}
/*
* neighbour must already be out of the table;
*
@@ -643,8 +710,7 @@ void neigh_destroy(struct neighbour *neigh)
write_seqlock_bh(&hh->hh_lock);
hh->hh_output = neigh_blackhole;
write_sequnlock_bh(&hh->hh_lock);
- if (atomic_dec_and_test(&hh->hh_refcnt))
- kfree(hh);
+ hh_cache_put(hh);
}
skb_queue_purge(&neigh->arp_queue);
@@ -655,7 +721,7 @@ void neigh_destroy(struct neighbour *neigh)
NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
atomic_dec(&neigh->tbl->entries);
- kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+ call_rcu(&neigh->rcu, neigh_destroy_rcu);
}
EXPORT_SYMBOL(neigh_destroy);
@@ -696,12 +762,16 @@ static void neigh_connect(struct neighbour *neigh)
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
- struct neighbour *n, **np;
+ struct neighbour *n;
+ struct neighbour __rcu **np;
unsigned int i;
+ struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
write_lock_bh(&tbl->lock);
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
/*
* periodically recompute ReachableTime from random function
@@ -715,10 +785,11 @@ static void neigh_periodic_work(struct work_struct *work)
neigh_rand_reach_time(p->base_reachable_time);
}
- for (i = 0 ; i <= tbl->hash_mask; i++) {
- np = &tbl->hash_buckets[i];
+ for (i = 0 ; i <= nht->hash_mask; i++) {
+ np = &nht->hash_buckets[i];
- while ((n = *np) != NULL) {
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
unsigned int state;
write_lock(&n->lock);
@@ -766,9 +837,9 @@ next_elt:
static __inline__ int neigh_max_probes(struct neighbour *n)
{
struct neigh_parms *p = n->parms;
- return (n->nud_state & NUD_PROBE ?
+ return (n->nud_state & NUD_PROBE) ?
p->ucast_probes :
- p->ucast_probes + p->app_probes + p->mcast_probes);
+ p->ucast_probes + p->app_probes + p->mcast_probes;
}
static void neigh_invalidate(struct neighbour *neigh)
@@ -945,7 +1016,7 @@ out_unlock_bh:
}
EXPORT_SYMBOL(__neigh_event_send);
-static void neigh_update_hhs(struct neighbour *neigh)
+static void neigh_update_hhs(const struct neighbour *neigh)
{
struct hh_cache *hh;
void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1081,7 +1152,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
}
if (lladdr != neigh->ha) {
+ write_seqlock(&neigh->ha_lock);
memcpy(&neigh->ha, lladdr, dev->addr_len);
+ write_sequnlock(&neigh->ha_lock);
neigh_update_hhs(neigh);
if (!(new & NUD_CONNECTED))
neigh->confirmed = jiffies -
@@ -1139,44 +1212,73 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
}
EXPORT_SYMBOL(neigh_event_ns);
+static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
+ __be16 protocol)
+{
+ struct hh_cache *hh;
+
+ smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
+ for (hh = n->hh; hh; hh = hh->hh_next) {
+ if (hh->hh_type == protocol) {
+ atomic_inc(&hh->hh_refcnt);
+ if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+ hh_cache_put(hh);
+ return true;
+ }
+ }
+ return false;
+}
+
+/* called with read_lock_bh(&n->lock); */
static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
__be16 protocol)
{
struct hh_cache *hh;
struct net_device *dev = dst->dev;
- for (hh = n->hh; hh; hh = hh->hh_next)
- if (hh->hh_type == protocol)
- break;
+ if (likely(neigh_hh_lookup(n, dst, protocol)))
+ return;
- if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
- seqlock_init(&hh->hh_lock);
- hh->hh_type = protocol;
- atomic_set(&hh->hh_refcnt, 0);
- hh->hh_next = NULL;
+ /* slow path */
+ hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
+ if (!hh)
+ return;
- if (dev->header_ops->cache(n, hh)) {
- kfree(hh);
- hh = NULL;
- } else {
- atomic_inc(&hh->hh_refcnt);
- hh->hh_next = n->hh;
- n->hh = hh;
- if (n->nud_state & NUD_CONNECTED)
- hh->hh_output = n->ops->hh_output;
- else
- hh->hh_output = n->ops->output;
- }
+ seqlock_init(&hh->hh_lock);
+ hh->hh_type = protocol;
+ atomic_set(&hh->hh_refcnt, 2);
+
+ if (dev->header_ops->cache(n, hh)) {
+ kfree(hh);
+ return;
}
- if (hh) {
- atomic_inc(&hh->hh_refcnt);
- dst->hh = hh;
+
+ write_lock_bh(&n->lock);
+
+ /* must check if another thread already did the insert */
+ if (neigh_hh_lookup(n, dst, protocol)) {
+ kfree(hh);
+ goto end;
}
+
+ if (n->nud_state & NUD_CONNECTED)
+ hh->hh_output = n->ops->hh_output;
+ else
+ hh->hh_output = n->ops->output;
+
+ hh->hh_next = n->hh;
+ smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
+ n->hh = hh;
+
+ if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+ hh_cache_put(hh);
+end:
+ write_unlock_bh(&n->lock);
}
/* This function can be used in contexts, where only old dev_queue_xmit
- worked, f.e. if you want to override normal output path (eql, shaper),
- but resolution is not made yet.
+ * worked, f.e. if you want to override normal output path (eql, shaper),
+ * but resolution is not made yet.
*/
int neigh_compat_output(struct sk_buff *skb)
@@ -1210,19 +1312,19 @@ int neigh_resolve_output(struct sk_buff *skb)
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
- if (dev->header_ops->cache && !dst->hh) {
- write_lock_bh(&neigh->lock);
- if (!dst->hh)
- neigh_hh_init(neigh, dst, dst->ops->protocol);
- err = dev_hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- write_unlock_bh(&neigh->lock);
- } else {
- read_lock_bh(&neigh->lock);
+ unsigned int seq;
+
+ if (dev->header_ops->cache &&
+ !dst->hh &&
+ !(dst->flags & DST_NOCACHE))
+ neigh_hh_init(neigh, dst, dst->ops->protocol);
+
+ do {
+ seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
- }
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
if (err >= 0)
rc = neigh->ops->queue_xmit(skb);
else
@@ -1248,13 +1350,16 @@ int neigh_connected_output(struct sk_buff *skb)
struct dst_entry *dst = skb_dst(skb);
struct neighbour *neigh = dst->neighbour;
struct net_device *dev = neigh->dev;
+ unsigned int seq;
__skb_pull(skb, skb_network_offset(skb));
- read_lock_bh(&neigh->lock);
- err = dev_hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
+ do {
+ seq = read_seqbegin(&neigh->ha_lock);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+ } while (read_seqretry(&neigh->ha_lock, seq));
+
if (err >= 0)
err = neigh->ops->queue_xmit(skb);
else {
@@ -1436,17 +1541,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
panic("cannot create neighbour proc dir entry");
#endif
- tbl->hash_mask = 1;
- tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
+ tbl->nht = neigh_hash_alloc(8);
phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
- if (!tbl->hash_buckets || !tbl->phash_buckets)
+ if (!tbl->nht || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
-
rwlock_init(&tbl->lock);
INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
@@ -1486,8 +1588,7 @@ int neigh_table_clear(struct neigh_table *tbl)
struct neigh_table **tp;
/* It is not clean... Fix it to unload IPv6 module safely */
- cancel_delayed_work(&tbl->gc_work);
- flush_scheduled_work();
+ cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
neigh_ifdown(tbl, NULL);
@@ -1502,8 +1603,8 @@ int neigh_table_clear(struct neigh_table *tbl)
}
write_unlock(&neigh_tbl_lock);
- neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
- tbl->hash_buckets = NULL;
+ call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
+ tbl->nht = NULL;
kfree(tbl->phash_buckets);
tbl->phash_buckets = NULL;
@@ -1529,6 +1630,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
struct net_device *dev = NULL;
int err = -EINVAL;
+ ASSERT_RTNL();
if (nlmsg_len(nlh) < sizeof(*ndm))
goto out;
@@ -1538,7 +1640,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
- dev = dev_get_by_index(net, ndm->ndm_ifindex);
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
@@ -1554,34 +1656,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
read_unlock(&neigh_tbl_lock);
if (nla_len(dst_attr) < tbl->key_len)
- goto out_dev_put;
+ goto out;
if (ndm->ndm_flags & NTF_PROXY) {
err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
- goto out_dev_put;
+ goto out;
}
if (dev == NULL)
- goto out_dev_put;
+ goto out;
neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
if (neigh == NULL) {
err = -ENOENT;
- goto out_dev_put;
+ goto out;
}
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN);
neigh_release(neigh);
- goto out_dev_put;
+ goto out;
}
read_unlock(&neigh_tbl_lock);
err = -EAFNOSUPPORT;
-out_dev_put:
- if (dev)
- dev_put(dev);
out:
return err;
}
@@ -1595,6 +1694,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
struct net_device *dev = NULL;
int err;
+ ASSERT_RTNL();
err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
if (err < 0)
goto out;
@@ -1605,14 +1705,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
- dev = dev_get_by_index(net, ndm->ndm_ifindex);
+ dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
err = -ENODEV;
goto out;
}
if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
- goto out_dev_put;
+ goto out;
}
read_lock(&neigh_tbl_lock);
@@ -1626,7 +1726,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
read_unlock(&neigh_tbl_lock);
if (nla_len(tb[NDA_DST]) < tbl->key_len)
- goto out_dev_put;
+ goto out;
dst = nla_data(tb[NDA_DST]);
lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
@@ -1639,29 +1739,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
pn->flags = ndm->ndm_flags;
err = 0;
}
- goto out_dev_put;
+ goto out;
}
if (dev == NULL)
- goto out_dev_put;
+ goto out;
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
- goto out_dev_put;
+ goto out;
}
neigh = __neigh_lookup_errno(tbl, dst, dev);
if (IS_ERR(neigh)) {
err = PTR_ERR(neigh);
- goto out_dev_put;
+ goto out;
}
} else {
if (nlh->nlmsg_flags & NLM_F_EXCL) {
err = -EEXIST;
neigh_release(neigh);
- goto out_dev_put;
+ goto out;
}
if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
@@ -1674,15 +1774,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
} else
err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
neigh_release(neigh);
- goto out_dev_put;
+ goto out;
}
read_unlock(&neigh_tbl_lock);
err = -EAFNOSUPPORT;
-
-out_dev_put:
- if (dev)
- dev_put(dev);
out:
return err;
}
@@ -1748,18 +1844,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
unsigned long now = jiffies;
unsigned int flush_delta = now - tbl->last_flush;
unsigned int rand_delta = now - tbl->last_rand;
-
+ struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
.ndtc_entry_size = tbl->entry_size,
.ndtc_entries = atomic_read(&tbl->entries),
.ndtc_last_flush = jiffies_to_msecs(flush_delta),
.ndtc_last_rand = jiffies_to_msecs(rand_delta),
- .ndtc_hash_rnd = tbl->hash_rnd,
- .ndtc_hash_mask = tbl->hash_mask,
.ndtc_proxy_qlen = tbl->proxy_queue.qlen,
};
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+ ndc.ndtc_hash_rnd = nht->hash_rnd;
+ ndc.ndtc_hash_mask = nht->hash_mask;
+ rcu_read_unlock_bh();
+
NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
}
@@ -2056,10 +2156,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
- if ((neigh->nud_state & NUD_VALID) &&
- nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
- read_unlock_bh(&neigh->lock);
- goto nla_put_failure;
+ if (neigh->nud_state & NUD_VALID) {
+ char haddr[MAX_ADDR_LEN];
+
+ neigh_ha_snapshot(haddr, neigh, neigh->dev);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+ read_unlock_bh(&neigh->lock);
+ goto nla_put_failure;
+ }
}
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
@@ -2087,18 +2191,23 @@ static void neigh_update_notify(struct neighbour *neigh)
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct net * net = sock_net(skb->sk);
+ struct net *net = sock_net(skb->sk);
struct neighbour *n;
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
+ struct neigh_hash_table *nht;
- read_lock_bh(&tbl->lock);
- for (h = 0; h <= tbl->hash_mask; h++) {
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ for (h = 0; h <= nht->hash_mask; h++) {
if (h < s_h)
continue;
if (h > s_h)
s_idx = 0;
- for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) {
+ for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+ n != NULL;
+ n = rcu_dereference_bh(n->next)) {
if (!net_eq(dev_net(n->dev), net))
continue;
if (idx < s_idx)
@@ -2107,17 +2216,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI) <= 0) {
- read_unlock_bh(&tbl->lock);
rc = -1;
goto out;
}
- next:
+next:
idx++;
}
}
- read_unlock_bh(&tbl->lock);
rc = skb->len;
out:
+ rcu_read_unlock_bh();
cb->args[1] = h;
cb->args[2] = idx;
return rc;
@@ -2150,15 +2258,22 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
{
int chain;
+ struct neigh_hash_table *nht;
- read_lock_bh(&tbl->lock);
- for (chain = 0; chain <= tbl->hash_mask; chain++) {
+ rcu_read_lock_bh();
+ nht = rcu_dereference_bh(tbl->nht);
+
+ read_lock(&tbl->lock); /* avoid resizes */
+ for (chain = 0; chain <= nht->hash_mask; chain++) {
struct neighbour *n;
- for (n = tbl->hash_buckets[chain]; n; n = n->next)
+ for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
+ n != NULL;
+ n = rcu_dereference_bh(n->next))
cb(n, cookie);
}
- read_unlock_bh(&tbl->lock);
+ read_unlock(&tbl->lock);
+ rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_for_each);
@@ -2167,18 +2282,25 @@ void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
int chain;
+ struct neigh_hash_table *nht;
- for (chain = 0; chain <= tbl->hash_mask; chain++) {
- struct neighbour *n, **np;
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ for (chain = 0; chain <= nht->hash_mask; chain++) {
+ struct neighbour *n;
+ struct neighbour __rcu **np;
- np = &tbl->hash_buckets[chain];
- while ((n = *np) != NULL) {
+ np = &nht->hash_buckets[chain];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock))) != NULL) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- *np = n->next;
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock)));
n->dead = 1;
} else
np = &n->next;
@@ -2196,13 +2318,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
- struct neigh_table *tbl = state->tbl;
+ struct neigh_hash_table *nht = state->nht;
struct neighbour *n = NULL;
int bucket = state->bucket;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
- n = tbl->hash_buckets[bucket];
+ for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
+ n = rcu_dereference_bh(nht->hash_buckets[bucket]);
while (n) {
if (!net_eq(dev_net(n->dev), net))
@@ -2219,8 +2341,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
break;
if (n->nud_state & ~NUD_NOARP)
break;
- next:
- n = n->next;
+next:
+ n = rcu_dereference_bh(n->next);
}
if (n)
@@ -2237,14 +2359,14 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
- struct neigh_table *tbl = state->tbl;
+ struct neigh_hash_table *nht = state->nht;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
if (v)
return n;
}
- n = n->next;
+ n = rcu_dereference_bh(n->next);
while (1) {
while (n) {
@@ -2261,17 +2383,17 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
if (n->nud_state & ~NUD_NOARP)
break;
- next:
- n = n->next;
+next:
+ n = rcu_dereference_bh(n->next);
}
if (n)
break;
- if (++state->bucket > tbl->hash_mask)
+ if (++state->bucket > nht->hash_mask)
break;
- n = tbl->hash_buckets[state->bucket];
+ n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
}
if (n && pos)
@@ -2369,7 +2491,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
}
void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
- __acquires(tbl->lock)
+ __acquires(rcu_bh)
{
struct neigh_seq_state *state = seq->private;
@@ -2377,7 +2499,8 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
state->bucket = 0;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
- read_lock_bh(&tbl->lock);
+ rcu_read_lock_bh();
+ state->nht = rcu_dereference_bh(tbl->nht);
return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
}
@@ -2411,12 +2534,9 @@ out:
EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
- __releases(tbl->lock)
+ __releases(rcu_bh)
{
- struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
-
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock_bh();
}
EXPORT_SYMBOL(neigh_seq_stop);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index af4dfbadf2a..b143173e3eb 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -515,7 +515,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
return attribute->store(queue, attribute, buf, count);
}
-static struct sysfs_ops rx_queue_sysfs_ops = {
+static const struct sysfs_ops rx_queue_sysfs_ops = {
.show = rx_queue_attr_show,
.store = rx_queue_attr_store,
};
@@ -726,6 +726,7 @@ static struct kobj_type rx_queue_ktype = {
static int rx_queue_add_kobject(struct net_device *net, int index)
{
struct netdev_rx_queue *queue = net->_rx + index;
+ struct netdev_rx_queue *first = queue->first;
struct kobject *kobj = &queue->kobj;
int error = 0;
@@ -738,38 +739,43 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
}
kobject_uevent(kobj, KOBJ_ADD);
+ atomic_inc(&first->count);
return error;
}
-static int rx_queue_register_kobjects(struct net_device *net)
+int
+net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
{
int i;
int error = 0;
- net->queues_kset = kset_create_and_add("queues",
- NULL, &net->dev.kobj);
- if (!net->queues_kset)
- return -ENOMEM;
- for (i = 0; i < net->num_rx_queues; i++) {
+ for (i = old_num; i < new_num; i++) {
error = rx_queue_add_kobject(net, i);
- if (error)
+ if (error) {
+ new_num = old_num;
break;
+ }
}
- if (error)
- while (--i >= 0)
- kobject_put(&net->_rx[i].kobj);
+ while (--i >= new_num)
+ kobject_put(&net->_rx[i].kobj);
return error;
}
-static void rx_queue_remove_kobjects(struct net_device *net)
+static int rx_queue_register_kobjects(struct net_device *net)
{
- int i;
+ net->queues_kset = kset_create_and_add("queues",
+ NULL, &net->dev.kobj);
+ if (!net->queues_kset)
+ return -ENOMEM;
+ return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+}
- for (i = 0; i < net->num_rx_queues; i++)
- kobject_put(&net->_rx[i].kobj);
+static void rx_queue_remove_kobjects(struct net_device *net)
+{
+ net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0);
kset_unregister(net->queues_kset);
}
#endif /* CONFIG_RPS */
@@ -789,12 +795,13 @@ static const void *net_netlink_ns(struct sock *sk)
return sock_net(sk);
}
-static struct kobj_ns_type_operations net_ns_type_operations = {
+struct kobj_ns_type_operations net_ns_type_operations = {
.type = KOBJ_NS_TYPE_NET,
.current_ns = net_current_ns,
.netlink_ns = net_netlink_ns,
.initial_ns = net_initial_ns,
};
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
static void net_kobj_ns_exit(struct net *net)
{
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 805555e8b18..778e1571548 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -4,4 +4,8 @@
int netdev_kobject_init(void);
int netdev_register_kobject(struct net_device *);
void netdev_unregister_kobject(struct net_device *);
+#ifdef CONFIG_RPS
+int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+#endif
+
#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380ed88..7f1bb2aba03 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/skb.h>
+#include <trace/events/net.h>
#include <trace/events/napi.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 537e01afd81..4e98ffac3af 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -288,11 +288,11 @@ static int netpoll_owner_active(struct net_device *dev)
return 0;
}
-void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+ struct net_device *dev)
{
int status = NETDEV_TX_BUSY;
unsigned long tries;
- struct net_device *dev = np->dev;
const struct net_device_ops *ops = dev->netdev_ops;
/* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo = np->dev->npinfo;
@@ -346,7 +346,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
schedule_delayed_work(&npinfo->tx_work,0);
}
}
-EXPORT_SYMBOL(netpoll_send_skb);
+EXPORT_SYMBOL(netpoll_send_skb_on_dev);
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 10a1ea72010..2c0df0f95b3 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -729,16 +729,14 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
*num = 0;
for (; i < maxlen; i++) {
+ int value;
char c;
*num <<= 4;
if (get_user(c, &user_buffer[i]))
return -EFAULT;
- if ((c >= '0') && (c <= '9'))
- *num |= c - '0';
- else if ((c >= 'a') && (c <= 'f'))
- *num |= c - 'a' + 10;
- else if ((c >= 'A') && (c <= 'F'))
- *num |= c - 'A' + 10;
+ value = hex_to_bin(c);
+ if (value >= 0)
+ *num |= value;
else
break;
}
@@ -3907,8 +3905,6 @@ static void __exit pg_cleanup(void)
{
struct pktgen_thread *t;
struct list_head *q, *n;
- wait_queue_head_t queue;
- init_waitqueue_head(&queue);
/* Stop all interfaces & threads */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f78d821bd93..8121268ddbd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -299,14 +299,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
unregister_netdevice_many(&list_kill);
}
-void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
-{
- rtnl_lock();
- __rtnl_kill_links(net, ops);
- rtnl_unlock();
-}
-EXPORT_SYMBOL_GPL(rtnl_kill_links);
-
/**
* __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
* @ops: struct rtnl_link_ops * to unregister
@@ -612,36 +604,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
{
- struct rtnl_link_stats64 a;
-
- a.rx_packets = b->rx_packets;
- a.tx_packets = b->tx_packets;
- a.rx_bytes = b->rx_bytes;
- a.tx_bytes = b->tx_bytes;
- a.rx_errors = b->rx_errors;
- a.tx_errors = b->tx_errors;
- a.rx_dropped = b->rx_dropped;
- a.tx_dropped = b->tx_dropped;
-
- a.multicast = b->multicast;
- a.collisions = b->collisions;
-
- a.rx_length_errors = b->rx_length_errors;
- a.rx_over_errors = b->rx_over_errors;
- a.rx_crc_errors = b->rx_crc_errors;
- a.rx_frame_errors = b->rx_frame_errors;
- a.rx_fifo_errors = b->rx_fifo_errors;
- a.rx_missed_errors = b->rx_missed_errors;
-
- a.tx_aborted_errors = b->tx_aborted_errors;
- a.tx_carrier_errors = b->tx_carrier_errors;
- a.tx_fifo_errors = b->tx_fifo_errors;
- a.tx_heartbeat_errors = b->tx_heartbeat_errors;
- a.tx_window_errors = b->tx_window_errors;
-
- a.rx_compressed = b->rx_compressed;
- a.tx_compressed = b->tx_compressed;
- memcpy(v, &a, sizeof(a));
+ memcpy(v, b, sizeof(*b));
}
/* All VF info */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3a2513f0d0c..104f8444754 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -202,8 +202,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->data = data;
skb_reset_tail_pointer(skb);
skb->end = skb->tail + size;
- kmemcheck_annotate_bitfield(skb, flags1);
- kmemcheck_annotate_bitfield(skb, flags2);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
skb->mac_header = ~0U;
#endif
@@ -249,10 +247,9 @@ EXPORT_SYMBOL(__alloc_skb);
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
unsigned int length, gfp_t gfp_mask)
{
- int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
struct sk_buff *skb;
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
@@ -261,16 +258,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
}
EXPORT_SYMBOL(__netdev_alloc_skb);
-struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
-{
- int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
- struct page *page;
-
- page = alloc_pages_node(node, gfp_mask, 0);
- return page;
-}
-EXPORT_SYMBOL(__netdev_alloc_page);
-
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size)
{
@@ -340,7 +327,7 @@ static void skb_release_data(struct sk_buff *skb)
put_page(skb_shinfo(skb)->frags[i].page);
}
- if (skb_has_frags(skb))
+ if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
kfree(skb->head);
@@ -466,6 +453,7 @@ void consume_skb(struct sk_buff *skb)
smp_rmb();
else if (likely(!atomic_dec_and_test(&skb->users)))
return;
+ trace_consume_skb(skb);
__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
@@ -685,16 +673,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
- int headerlen = skb->data - skb->head;
- /*
- * Allocate the copy buffer
- */
- struct sk_buff *n;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- n = alloc_skb(skb->end + skb->data_len, gfp_mask);
-#else
- n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
-#endif
+ int headerlen = skb_headroom(skb);
+ unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
+ struct sk_buff *n = alloc_skb(size, gfp_mask);
+
if (!n)
return NULL;
@@ -726,20 +708,14 @@ EXPORT_SYMBOL(skb_copy);
struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
{
- /*
- * Allocate the copy buffer
- */
- struct sk_buff *n;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- n = alloc_skb(skb->end, gfp_mask);
-#else
- n = alloc_skb(skb->end - skb->head, gfp_mask);
-#endif
+ unsigned int size = skb_end_pointer(skb) - skb->head;
+ struct sk_buff *n = alloc_skb(size, gfp_mask);
+
if (!n)
goto out;
/* Set the data pointer */
- skb_reserve(n, skb->data - skb->head);
+ skb_reserve(n, skb_headroom(skb));
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
@@ -759,7 +735,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
skb_shinfo(n)->nr_frags = i;
}
- if (skb_has_frags(skb)) {
+ if (skb_has_frag_list(skb)) {
skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
skb_clone_fraglist(n);
}
@@ -791,12 +767,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
{
int i;
u8 *data;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- int size = nhead + skb->end + ntail;
-#else
- int size = nhead + (skb->end - skb->head) + ntail;
-#endif
+ int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
long off;
+ bool fastpath;
BUG_ON(nhead < 0);
@@ -810,23 +783,36 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
goto nodata;
/* Copy only real data... and, alas, header. This should be
- * optimized for the cases when header is void. */
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- memcpy(data + nhead, skb->head, skb->tail);
-#else
- memcpy(data + nhead, skb->head, skb->tail - skb->head);
-#endif
- memcpy(data + size, skb_end_pointer(skb),
+ * optimized for the cases when header is void.
+ */
+ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
+
+ memcpy((struct skb_shared_info *)(data + size),
+ skb_shinfo(skb),
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- get_page(skb_shinfo(skb)->frags[i].page);
+ /* Check if we can avoid taking references on fragments if we own
+ * the last reference on skb->head. (see skb_release_data())
+ */
+ if (!skb->cloned)
+ fastpath = true;
+ else {
+ int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
- if (skb_has_frags(skb))
- skb_clone_fraglist(skb);
+ fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
+ }
- skb_release_data(skb);
+ if (fastpath) {
+ kfree(skb->head);
+ } else {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ get_page(skb_shinfo(skb)->frags[i].page);
+
+ if (skb_has_frag_list(skb))
+ skb_clone_fraglist(skb);
+ skb_release_data(skb);
+ }
off = (data + nhead) - skb->head;
skb->head = data;
@@ -1099,7 +1085,7 @@ drop_pages:
for (; i < nfrags; i++)
put_page(skb_shinfo(skb)->frags[i].page);
- if (skb_has_frags(skb))
+ if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
goto done;
}
@@ -1194,7 +1180,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Optimization: no fragments, no reasons to preestimate
* size of pulled pages. Superb.
*/
- if (!skb_has_frags(skb))
+ if (!skb_has_frag_list(skb))
goto pull_pages;
/* Estimate size of pulled pages. */
@@ -2323,7 +2309,7 @@ next_skb:
st->frag_data = NULL;
}
- if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) {
+ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
st->frag_idx = 0;
goto next_skb;
@@ -2573,6 +2559,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
__copy_skb_header(nskb, skb);
nskb->mac_len = skb->mac_len;
+ /* nskb and skb might have different headroom */
+ if (nskb->ip_summed == CHECKSUM_PARTIAL)
+ nskb->csum_start += skb_headroom(nskb) - headroom;
+
skb_reset_mac_header(nskb);
skb_set_network_header(nskb, skb->mac_len);
nskb->transport_header = (nskb->network_header +
@@ -2703,7 +2693,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
return -E2BIG;
headroom = skb_headroom(p);
- nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p));
+ nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
if (unlikely(!nskb))
return -ENOMEM;
@@ -2889,7 +2879,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
return -ENOMEM;
/* Easy case. Most of packets will go this way. */
- if (!skb_has_frags(skb)) {
+ if (!skb_has_frag_list(skb)) {
/* A little of trouble, not enough of space for trailer.
* This should not happen, when stack is tuned to generate
* good frames. OK, on miss we reallocate and reserve even more
@@ -2924,7 +2914,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
if (skb1->next == NULL && tailbits) {
if (skb_shinfo(skb1)->nr_frags ||
- skb_has_frags(skb1) ||
+ skb_has_frag_list(skb1) ||
skb_tailroom(skb1) < tailbits)
ntail = tailbits + 128;
}
@@ -2933,7 +2923,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
skb_cloned(skb1) ||
ntail ||
skb_shinfo(skb1)->nr_frags ||
- skb_has_frags(skb1)) {
+ skb_has_frag_list(skb1)) {
struct sk_buff *skb2;
/* Fuck, we are miserable poor guys... */
@@ -3016,7 +3006,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
} else {
/*
* no hardware time stamps available,
- * so keep the skb_shared_tx and only
+ * so keep the shared tx_flags and only
* store software time stamp
*/
skb->tstamp = ktime_get_real();
diff --git a/net/core/sock.c b/net/core/sock.c
index b05b9b6ddb8..11db43632df 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
- u32 classid = task_cls_classid(current);
+ u32 classid;
+ rcu_read_lock(); /* doing current task, which cannot vanish. */
+ classid = task_cls_classid(current);
+ rcu_read_unlock();
if (classid && classid != sk->sk_classid)
sk->sk_classid = classid;
}
@@ -1351,9 +1354,9 @@ int sock_i_uid(struct sock *sk)
{
int uid;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
return uid;
}
EXPORT_SYMBOL(sock_i_uid);
@@ -1362,9 +1365,9 @@ unsigned long sock_i_ino(struct sock *sk)
{
unsigned long ino;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
return ino;
}
EXPORT_SYMBOL(sock_i_ino);
@@ -1557,6 +1560,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
EXPORT_SYMBOL(sock_alloc_send_skb);
static void __lock_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
{
DEFINE_WAIT(wait);
@@ -1573,6 +1578,8 @@ static void __lock_sock(struct sock *sk)
}
static void __release_sock(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+ __acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb = sk->sk_backlog.head;
diff --git a/net/core/stream.c b/net/core/stream.c
index d959e0f4152..f5df85dcd20 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -141,10 +141,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, !sk->sk_err &&
- !(sk->sk_shutdown & SEND_SHUTDOWN) &&
- sk_stream_memory_free(sk) &&
- vm_wait);
+ sk_wait_event(sk, &current_timeo, sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ (sk_stream_memory_free(sk) &&
+ !vm_wait));
sk->sk_write_pending--;
if (vm_wait) {
diff --git a/net/core/utils.c b/net/core/utils.c
index f4185447053..5fea0ab2190 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -75,7 +75,7 @@ __be32 in_aton(const char *str)
str++;
}
}
- return(htonl(l));
+ return htonl(l);
}
EXPORT_SYMBOL(in_aton);
@@ -92,18 +92,19 @@ EXPORT_SYMBOL(in_aton);
static inline int xdigit2bin(char c, int delim)
{
+ int val;
+
if (c == delim || c == '\0')
return IN6PTON_DELIM;
if (c == ':')
return IN6PTON_COLON_MASK;
if (c == '.')
return IN6PTON_DOT;
- if (c >= '0' && c <= '9')
- return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0'));
- if (c >= 'a' && c <= 'f')
- return (IN6PTON_XDIGIT | (c - 'a' + 10));
- if (c >= 'A' && c <= 'F')
- return (IN6PTON_XDIGIT | (c - 'A' + 10));
+
+ val = hex_to_bin(c);
+ if (val >= 0)
+ return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
+
if (delim == -1)
return IN6PTON_DELIM;
return IN6PTON_UNKNOWN;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 6df6f8ac963..117fb093dca 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -62,22 +62,18 @@ struct ccid_operations {
void (*ccid_hc_tx_exit)(struct sock *sk);
void (*ccid_hc_rx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_rx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
+ int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
int (*ccid_hc_rx_insert_options)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_tx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
+ int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
int (*ccid_hc_tx_send_packet)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_sent)(struct sock *sk,
- int more, unsigned int len);
+ unsigned int len);
void (*ccid_hc_rx_get_info)(struct sock *sk,
struct tcp_info *info);
void (*ccid_hc_tx_get_info)(struct sock *sk,
@@ -148,10 +144,10 @@ static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
}
static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
- int more, unsigned int len)
+ unsigned int len)
{
if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
+ ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
}
static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -168,27 +164,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
}
+/**
+ * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
- value);
- return rc;
+ if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
}
+/**
+ * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- int rc = 0;
- if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
- rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
- return rc;
+ if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
}
static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 8408398cd44..0581143cb80 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG
If in doubt, say N.
-config IP_DCCP_CCID3_RTO
- int "Use higher bound for nofeedback timer"
- default 100
- depends on IP_DCCP_CCID3 && EXPERIMENTAL
- ---help---
- Use higher lower bound for nofeedback timer expiration.
-
- The TFRC nofeedback timer normally expires after the maximum of 4
- RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
- with a small RTT this can mean a high processing load and reduced
- performance, since then the nofeedback timer is triggered very
- frequently.
-
- This option enables to set a higher lower bound for the nofeedback
- value. Values in units of milliseconds can be set here.
-
- A value of 0 disables this feature by enforcing the value specified
- in RFC 3448. The following values have been suggested as bounds for
- experimental use:
- * 16-20ms to match the typical multimedia inter-frame interval
- * 100ms as a reasonable compromise [default]
- * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
-
- The default of 100ms is a compromise between a large value for
- efficient DCCP implementations, and a small value to avoid disrupting
- the network in times of congestion.
-
- The purpose of the nofeedback timer is to slow DCCP down when there
- is serious network congestion: experimenting with larger values should
- therefore not be performed on WANs.
-
config IP_DCCP_TFRC_LIB
def_bool y if IP_DCCP_CCID3
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9b3ae9922be..d850e291f87 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,59 +25,14 @@
*/
#include <linux/slab.h>
#include "../feat.h"
-#include "../ccid.h"
-#include "../dccp.h"
#include "ccid2.h"
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
static int ccid2_debug;
#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc)
-{
- int len = 0;
- int pipe = 0;
- struct ccid2_seq *seqp = hc->tx_seqh;
-
- /* there is data in the chain */
- if (seqp != hc->tx_seqt) {
- seqp = seqp->ccid2s_prev;
- len++;
- if (!seqp->ccid2s_acked)
- pipe++;
-
- while (seqp != hc->tx_seqt) {
- struct ccid2_seq *prev = seqp->ccid2s_prev;
-
- len++;
- if (!prev->ccid2s_acked)
- pipe++;
-
- /* packets are sent sequentially */
- BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
- prev->ccid2s_seq ) >= 0);
- BUG_ON(time_before(seqp->ccid2s_sent,
- prev->ccid2s_sent));
-
- seqp = prev;
- }
- }
-
- BUG_ON(pipe != hc->tx_pipe);
- ccid2_pr_debug("len of chain=%d\n", len);
-
- do {
- seqp = seqp->ccid2s_prev;
- len++;
- } while (seqp != hc->tx_seqh);
-
- ccid2_pr_debug("total len=%d\n", len);
- BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN);
-}
#else
#define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hc)
#endif
static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
@@ -156,19 +111,10 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
dp->dccps_l_ack_ratio = val;
}
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val)
-{
- ccid2_pr_debug("change SRTT to %ld\n", val);
- hc->tx_srtt = val;
-}
-
-static void ccid2_start_rto_timer(struct sock *sk);
-
static void ccid2_hc_tx_rto_expire(unsigned long data)
{
struct sock *sk = (struct sock *)data;
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- long s;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
@@ -178,23 +124,19 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
ccid2_pr_debug("RTO_EXPIRE\n");
- ccid2_hc_tx_check_sanity(hc);
-
/* back-off timer */
hc->tx_rto <<= 1;
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
- s = hc->tx_rto / HZ;
- if (s > 60)
- hc->tx_rto = 60 * HZ;
-
- ccid2_start_rto_timer(sk);
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
/* adjust pipe, cwnd etc */
hc->tx_ssthresh = hc->tx_cwnd / 2;
if (hc->tx_ssthresh < 2)
hc->tx_ssthresh = 2;
- hc->tx_cwnd = 1;
- hc->tx_pipe = 0;
+ hc->tx_cwnd = 1;
+ hc->tx_pipe = 0;
/* clear state about stuff we sent */
hc->tx_seqt = hc->tx_seqh;
@@ -204,23 +146,12 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
hc->tx_rpseq = 0;
hc->tx_rpdupack = -1;
ccid2_change_l_ack_ratio(sk, 1);
- ccid2_hc_tx_check_sanity(hc);
out:
bh_unlock_sock(sk);
sock_put(sk);
}
-static void ccid2_start_rto_timer(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto);
-
- BUG_ON(timer_pending(&hc->tx_rtotimer));
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-}
-
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
@@ -230,7 +161,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
hc->tx_seqh->ccid2s_acked = 0;
- hc->tx_seqh->ccid2s_sent = jiffies;
+ hc->tx_seqh->ccid2s_sent = ccid2_time_stamp;
next = hc->tx_seqh->ccid2s_next;
/* check if we need to alloc more space */
@@ -296,23 +227,20 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
}
#endif
- /* setup RTO timer */
- if (!timer_pending(&hc->tx_rtotimer))
- ccid2_start_rto_timer(sk);
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
do {
struct ccid2_seq *seqp = hc->tx_seqt;
while (seqp != hc->tx_seqh) {
- ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
+ ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
(unsigned long long)seqp->ccid2s_seq,
seqp->ccid2s_acked, seqp->ccid2s_sent);
seqp = seqp->ccid2s_next;
}
} while (0);
ccid2_pr_debug("=========\n");
- ccid2_hc_tx_check_sanity(hc);
#endif
}
@@ -378,17 +306,87 @@ out_invalid_option:
return -1;
}
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ * which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
+ */
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ long m = mrtt ? : 1;
- sk_stop_timer(sk, &hc->tx_rtotimer);
- ccid2_pr_debug("deleted RTO timer\n");
+ if (hc->tx_srtt == 0) {
+ /* First measurement m */
+ hc->tx_srtt = m << 3;
+ hc->tx_mdev = m << 1;
+
+ hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
+ hc->tx_rttvar = hc->tx_mdev_max;
+
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ } else {
+ /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+ m -= (hc->tx_srtt >> 3);
+ hc->tx_srtt += m;
+
+ /* Similarly, update scaled mdev with regard to |m| */
+ if (m < 0) {
+ m = -m;
+ m -= (hc->tx_mdev >> 2);
+ /*
+ * This neutralises RTO increase when RTT < SRTT - mdev
+ * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+ * in Linux TCP", USENIX 2002, pp. 49-62).
+ */
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (hc->tx_mdev >> 2);
+ }
+ hc->tx_mdev += m;
+
+ if (hc->tx_mdev > hc->tx_mdev_max) {
+ hc->tx_mdev_max = hc->tx_mdev;
+ if (hc->tx_mdev_max > hc->tx_rttvar)
+ hc->tx_rttvar = hc->tx_mdev_max;
+ }
+
+ /*
+ * Decay RTTVAR at most once per flight, exploiting that
+ * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
+ * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
+ * GAR is a useful bound for FlightSize = pipe.
+ * AWL is probably too low here, as it over-estimates pipe.
+ */
+ if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
+ if (hc->tx_mdev_max < hc->tx_rttvar)
+ hc->tx_rttvar -= (hc->tx_rttvar -
+ hc->tx_mdev_max) >> 2;
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ hc->tx_mdev_max = tcp_rto_min(sk);
+ }
+ }
+
+ /*
+ * Set RTO from SRTT and RTTVAR
+ * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
+ * This agrees with RFC 4341, 5:
+ * "Because DCCP does not retransmit data, DCCP does not require
+ * TCP's recommended minimum timeout of one second".
+ */
+ hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
+
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
}
-static inline void ccid2_new_ack(struct sock *sk,
- struct ccid2_seq *seqp,
- unsigned int *maxincr)
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+ unsigned int *maxincr)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
@@ -402,93 +400,27 @@ static inline void ccid2_new_ack(struct sock *sk,
hc->tx_cwnd += 1;
hc->tx_packets_acked = 0;
}
-
- /* update RTO */
- if (hc->tx_srtt == -1 ||
- time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) {
- unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
- int s;
-
- /* first measurement */
- if (hc->tx_srtt == -1) {
- ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
- r, jiffies,
- (unsigned long long)seqp->ccid2s_seq);
- ccid2_change_srtt(hc, r);
- hc->tx_rttvar = r >> 1;
- } else {
- /* RTTVAR */
- long tmp = hc->tx_srtt - r;
- long srtt;
-
- if (tmp < 0)
- tmp *= -1;
-
- tmp >>= 2;
- hc->tx_rttvar *= 3;
- hc->tx_rttvar >>= 2;
- hc->tx_rttvar += tmp;
-
- /* SRTT */
- srtt = hc->tx_srtt;
- srtt *= 7;
- srtt >>= 3;
- tmp = r >> 3;
- srtt += tmp;
- ccid2_change_srtt(hc, srtt);
- }
- s = hc->tx_rttvar << 2;
- /* clock granularity is 1 when based on jiffies */
- if (!s)
- s = 1;
- hc->tx_rto = hc->tx_srtt + s;
-
- /* must be at least a second */
- s = hc->tx_rto / HZ;
- /* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
- if (s < 1)
- hc->tx_rto = HZ;
-#endif
- /* max 60 seconds */
- if (s > 60)
- hc->tx_rto = HZ * 60;
-
- hc->tx_lastrtt = jiffies;
-
- ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
- hc->tx_srtt, hc->tx_rttvar,
- hc->tx_rto, HZ, r);
- }
-
- /* we got a new ack, so re-start RTO timer */
- ccid2_hc_tx_kill_rto_timer(sk);
- ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- if (hc->tx_pipe == 0)
- DCCP_BUG("pipe == 0");
- else
- hc->tx_pipe--;
-
- if (hc->tx_pipe == 0)
- ccid2_hc_tx_kill_rto_timer(sk);
+ /*
+ * FIXME: RTT is sampled several times per acknowledgment (for each
+ * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+ * This causes the RTT to be over-estimated, since the older entries
+ * in the Ack Vector have earlier sending times.
+ * The cleanest solution is to not use the ccid2s_sent field at all
+ * and instead use DCCP timestamps: requires changes in other places.
+ */
+ ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
}
static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) {
+ if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
return;
}
- hc->tx_last_cong = jiffies;
+ hc->tx_last_cong = ccid2_time_stamp;
hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
@@ -510,7 +442,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
int done = 0;
unsigned int maxincr = 0;
- ccid2_hc_tx_check_sanity(hc);
/* check reverse path congestion */
seqno = DCCP_SKB_CB(skb)->dccpd_seq;
@@ -620,7 +551,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
seqp->ccid2s_acked = 1;
ccid2_pr_debug("Got ack for %llu\n",
(unsigned long long)seqp->ccid2s_seq);
- ccid2_hc_tx_dec_pipe(sk);
+ hc->tx_pipe--;
}
if (seqp == hc->tx_seqt) {
done = 1;
@@ -677,7 +608,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* one ack vector.
*/
ccid2_congestion_event(sk, seqp);
- ccid2_hc_tx_dec_pipe(sk);
+ hc->tx_pipe--;
}
if (seqp == hc->tx_seqt)
break;
@@ -695,7 +626,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
hc->tx_seqt = hc->tx_seqt->ccid2s_next;
}
- ccid2_hc_tx_check_sanity(hc);
+ /* restart RTO timer if not all outstanding data has been acked */
+ if (hc->tx_pipe == 0)
+ sk_stop_timer(sk, &hc->tx_rtotimer);
+ else
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
}
static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -707,12 +642,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
hc->tx_ssthresh = ~0U;
- /*
- * RFC 4341, 5: "The cwnd parameter is initialized to at most four
- * packets for new connections, following the rules from [RFC3390]".
- * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
- */
- hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+ /* Use larger initial windows (RFC 4341, section 5). */
+ hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
/* Make sure that Ack Ratio is enabled and within bounds. */
max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
@@ -723,15 +654,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
if (ccid2_hc_tx_alloc_seq(hc))
return -ENOMEM;
- hc->tx_rto = 3 * HZ;
- ccid2_change_srtt(hc, -1);
- hc->tx_rttvar = -1;
+ hc->tx_rto = DCCP_TIMEOUT_INIT;
hc->tx_rpdupack = -1;
- hc->tx_last_cong = jiffies;
+ hc->tx_last_cong = ccid2_time_stamp;
setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
(unsigned long)sk);
-
- ccid2_hc_tx_check_sanity(hc);
return 0;
}
@@ -740,7 +667,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
int i;
- ccid2_hc_tx_kill_rto_timer(sk);
+ sk_stop_timer(sk, &hc->tx_rtotimer);
for (i = 0; i < hc->tx_seqbufc; i++)
kfree(hc->tx_seqbuf[i]);
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 1ec6a30103b..9731c2dc148 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -18,18 +18,23 @@
#ifndef _DCCP_CCID2_H_
#define _DCCP_CCID2_H_
-#include <linux/dccp.h>
#include <linux/timer.h>
#include <linux/types.h>
#include "../ccid.h"
+#include "../dccp.h"
+
+/*
+ * CCID-2 timestamping faces the same issues as TCP timestamping.
+ * Hence we reuse/share as much of the code as possible.
+ */
+#define ccid2_time_stamp tcp_time_stamp
+
/* NUMDUPACK parameter from RFC 4341, p. 6 */
#define NUMDUPACK 3
-struct sock;
-
struct ccid2_seq {
u64 ccid2s_seq;
- unsigned long ccid2s_sent;
+ u32 ccid2s_sent;
int ccid2s_acked;
struct ccid2_seq *ccid2s_prev;
struct ccid2_seq *ccid2s_next;
@@ -42,7 +47,12 @@ struct ccid2_seq {
* struct ccid2_hc_tx_sock - CCID2 TX half connection
* @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
* @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
- * @tx_lastrtt: time RTT was last measured
+ * @tx_srtt: smoothed RTT estimate, scaled by 2^3
+ * @tx_mdev: smoothed RTT variation, scaled by 2^2
+ * @tx_mdev_max: maximum of @mdev during one flight
+ * @tx_rttvar: moving average/maximum of @mdev_max
+ * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @tx_rtt_seq: to decay RTTVAR at most once per flight
* @tx_rpseq: last consecutive seqno
* @tx_rpdupack: dupacks since rpseq
*/
@@ -55,14 +65,19 @@ struct ccid2_hc_tx_sock {
int tx_seqbufc;
struct ccid2_seq *tx_seqh;
struct ccid2_seq *tx_seqt;
- long tx_rto;
- long tx_srtt;
- long tx_rttvar;
- unsigned long tx_lastrtt;
+
+ /* RTT measurement: variables/principles are the same as in TCP */
+ u32 tx_srtt,
+ tx_mdev,
+ tx_mdev_max,
+ tx_rttvar,
+ tx_rto;
+ u64 tx_rtt_seq:48;
struct timer_list tx_rtotimer;
+
u64 tx_rpseq;
int tx_rpdupack;
- unsigned long tx_last_cong;
+ u32 tx_last_cong;
u64 tx_high_ack;
};
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 95f75298649..3060a60ed5a 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
[TFRC_SSTATE_NO_SENT] = "NO_SENT",
[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
[TFRC_SSTATE_FBACK] = "FBACK",
- [TFRC_SSTATE_TERM] = "TERM",
};
return ccid3_state_names[state];
@@ -91,19 +90,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
return scaled_div(w_init << 6, hc->tx_rtt);
}
-/*
- * Recalculate t_ipi and delta (should be called whenever X changes)
+/**
+ * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
*/
static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
{
- /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
- /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
- hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN);
-
- ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi,
- hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6));
+ ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+ hc->tx_s, (unsigned)(hc->tx_x >> 6));
}
static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
@@ -211,16 +207,19 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
ccid3_tx_state_name(hc->tx_state));
+ /* Ignore and do not restart after leaving the established state */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+ goto out;
+
+ /* Reset feedback state to "no feedback received" */
if (hc->tx_state == TFRC_SSTATE_FBACK)
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
- else if (hc->tx_state != TFRC_SSTATE_NO_FBACK)
- goto out;
/*
* Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+ * RTO is 0 if and only if no feedback has been received yet.
*/
- if (hc->tx_t_rto == 0 || /* no feedback received yet */
- hc->tx_p == 0) {
+ if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
/* halve send rate directly */
hc->tx_x = max(hc->tx_x / 2,
@@ -256,7 +255,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
* Set new timeout for the nofeedback timer.
* See comments in packet_recv() regarding the value of t_RTO.
*/
- if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */
+ if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
t_nfb = TFRC_INITIAL_TIMEOUT;
else
t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
@@ -290,8 +289,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
if (unlikely(skb->len == 0))
return -EBADMSG;
- switch (hc->tx_state) {
- case TFRC_SSTATE_NO_SENT:
+ if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
hc->tx_last_win_count = 0;
@@ -326,27 +324,22 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
ccid3_update_send_interval(hc);
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
- break;
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
+
+ } else {
delay = ktime_us_delta(hc->tx_t_nom, now);
ccid3_pr_debug("delay=%ld\n", (long)delay);
/*
- * Scheduling of packet transmissions [RFC 3448, 4.6]
+ * Scheduling of packet transmissions (RFC 5348, 8.3)
*
* if (t_now > t_nom - delta)
* // send the packet now
* else
* // send the packet in (t_nom - t_now) milliseconds.
*/
- if (delay - (s64)hc->tx_delta >= 1000)
- return (u32)delay / 1000L;
+ if (delay >= TFRC_T_DELTA)
+ return (u32)delay / USEC_PER_MSEC;
ccid3_hc_tx_update_win_count(hc, now);
- break;
- case TFRC_SSTATE_TERM:
- DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
- return -EINVAL;
}
/* prepare to send now (add options etc.) */
@@ -358,8 +351,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
- unsigned int len)
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
@@ -372,48 +364,34 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
+ struct tfrc_tx_hist_entry *acked;
ktime_t now;
unsigned long t_nfb;
- u32 pinv, r_sample;
+ u32 r_sample;
/* we are only interested in ACKs */
if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
return;
- /* ... and only in the established state */
- if (hc->tx_state != TFRC_SSTATE_FBACK &&
- hc->tx_state != TFRC_SSTATE_NO_FBACK)
- return;
-
- opt_recv = &hc->tx_options_received;
- now = ktime_get_real();
-
- /* Estimate RTT from history if ACK number is valid */
- r_sample = tfrc_tx_hist_rtt(hc->tx_hist,
- DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
- if (r_sample == 0) {
- DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
- dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
- return;
- }
-
- /* Update receive rate in units of 64 * bytes/second */
- hc->tx_x_recv = opt_recv->ccid3or_receive_rate;
- hc->tx_x_recv <<= 6;
-
- /* Update loss event rate (which is scaled by 1e6) */
- pinv = opt_recv->ccid3or_loss_event_rate;
- if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */
- hc->tx_p = 0;
- else /* can not exceed 100% */
- hc->tx_p = scaled_div(1, pinv);
/*
- * Validate new RTT sample and update moving average
+ * Locate the acknowledged packet in the TX history.
+ *
+ * Returning "entry not found" here can for instance happen when
+ * - the host has not sent out anything (e.g. a passive server),
+ * - the Ack is outdated (packet with higher Ack number was received),
+ * - it is a bogus Ack (for a packet not sent on this connection).
*/
- r_sample = dccp_sample_rtt(sk, r_sample);
+ acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+ if (acked == NULL)
+ return;
+ /* For the sake of RTT sampling, ignore/remove all older entries */
+ tfrc_tx_hist_purge(&acked->next);
+
+ /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+ now = ktime_get_real();
+ r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
+
/*
* Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
*/
@@ -461,13 +439,12 @@ done_computing_x:
sk->sk_write_space(sk);
/*
- * Update timeout interval for the nofeedback timer.
- * We use a configuration option to increase the lower bound.
- * This can help avoid triggering the nofeedback timer too
- * often ('spinning') on LANs with small RTTs.
+ * Update timeout interval for the nofeedback timer. In order to control
+ * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+ * tunable RTAX_RTO_MIN value as the lower bound.
*/
- hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO *
- (USEC_PER_SEC / 1000)));
+ hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+ USEC_PER_SEC/HZ * tcp_rto_min(sk));
/*
* Schedule no feedback timer to expire in
* max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
@@ -482,66 +459,41 @@ done_computing_x:
jiffies + usecs_to_jiffies(t_nfb));
}
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
- unsigned char len, u16 idx,
- unsigned char *value)
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
{
- int rc = 0;
- const struct dccp_sock *dp = dccp_sk(sk);
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
__be32 opt_val;
- opt_recv = &hc->tx_options_received;
-
- if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
- opt_recv->ccid3or_seqno = dp->dccps_gsr;
- opt_recv->ccid3or_loss_event_rate = ~0;
- opt_recv->ccid3or_loss_intervals_idx = 0;
- opt_recv->ccid3or_loss_intervals_len = 0;
- opt_recv->ccid3or_receive_rate = 0;
- }
-
switch (option) {
+ case TFRC_OPT_RECEIVE_RATE:
case TFRC_OPT_LOSS_EVENT_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s(%p), invalid len %d "
- "for TFRC_OPT_LOSS_EVENT_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
- ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_event_rate);
+ /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+ if (packet_type == DCCP_PKT_DATA)
+ break;
+ if (unlikely(optlen != 4)) {
+ DCCP_WARN("%s(%p), invalid len %d for %u\n",
+ dccp_role(sk), sk, optlen, option);
+ return -EINVAL;
}
- break;
- case TFRC_OPT_LOSS_INTERVALS:
- opt_recv->ccid3or_loss_intervals_idx = idx;
- opt_recv->ccid3or_loss_intervals_len = len;
- ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_intervals_idx,
- opt_recv->ccid3or_loss_intervals_len);
- break;
- case TFRC_OPT_RECEIVE_RATE:
- if (unlikely(len != 4)) {
- DCCP_WARN("%s(%p), invalid len %d "
- "for TFRC_OPT_RECEIVE_RATE\n",
- dccp_role(sk), sk, len);
- rc = -EINVAL;
- } else {
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->ccid3or_receive_rate = ntohl(opt_val);
+ opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+ if (option == TFRC_OPT_RECEIVE_RATE) {
+ /* Receive Rate is kept in units of 64 bytes/second */
+ hc->tx_x_recv = opt_val;
+ hc->tx_x_recv <<= 6;
+
ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_receive_rate);
+ dccp_role(sk), sk, opt_val);
+ } else {
+ /* Update the fixpoint Loss Event Rate fraction */
+ hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+ ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+ dccp_role(sk), sk, opt_val);
}
- break;
}
-
- return rc;
+ return 0;
}
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -559,42 +511,36 @@ static void ccid3_hc_tx_exit(struct sock *sk)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-
tfrc_tx_hist_purge(&hc->tx_hist);
}
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
- struct ccid3_hc_tx_sock *hc;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- hc = ccid3_hc_tx_sk(sk);
- info->tcpi_rto = hc->tx_t_rto;
- info->tcpi_rtt = hc->tx_rtt;
+ info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
+ info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
}
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
- const struct ccid3_hc_tx_sock *hc;
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_info tfrc;
const void *val;
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- hc = ccid3_hc_tx_sk(sk);
switch (optname) {
case DCCP_SOCKOPT_CCID_TX_INFO:
- if (len < sizeof(hc->tx_tfrc))
+ if (len < sizeof(tfrc))
return -EINVAL;
- len = sizeof(hc->tx_tfrc);
- val = &hc->tx_tfrc;
+ tfrc.tfrctx_x = hc->tx_x;
+ tfrc.tfrctx_x_recv = hc->tx_x_recv;
+ tfrc.tfrctx_x_calc = hc->tx_x_calc;
+ tfrc.tfrctx_rtt = hc->tx_rtt;
+ tfrc.tfrctx_p = hc->tx_p;
+ tfrc.tfrctx_rto = hc->tx_t_rto;
+ tfrc.tfrctx_ipi = hc->tx_t_ipi;
+ len = sizeof(tfrc);
+ val = &tfrc;
break;
default:
return -ENOPROTOOPT;
@@ -624,7 +570,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
static const char *const ccid3_rx_state_names[] = {
[TFRC_RSTATE_NO_DATA] = "NO_DATA",
[TFRC_RSTATE_DATA] = "DATA",
- [TFRC_RSTATE_TERM] = "TERM",
};
return ccid3_rx_state_names[state];
@@ -650,14 +595,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- ktime_t now;
+ ktime_t now = ktime_get_real();
s64 delta = 0;
- if (unlikely(hc->rx_state == TFRC_RSTATE_TERM))
- return;
-
- now = ktime_get_real();
-
switch (fbtype) {
case CCID3_FBACK_INITIAL:
hc->rx_x_recv = 0;
@@ -701,14 +641,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
{
- const struct ccid3_hc_rx_sock *hc;
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
__be32 x_recv, pinv;
if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
return 0;
- hc = ccid3_hc_rx_sk(sk);
-
if (dccp_packet_without_ack(skb))
return 0;
@@ -749,10 +687,11 @@ static u32 ccid3_first_li(struct sock *sk)
x_recv = scaled_div32(hc->rx_bytes_recv, delta);
if (x_recv == 0) { /* would also trigger divide-by-zero */
DCCP_WARN("X_recv==0\n");
- if ((x_recv = hc->rx_x_recv) == 0) {
+ if (hc->rx_x_recv == 0) {
DCCP_BUG("stored value of X_recv is zero");
return ~0U;
}
+ x_recv = hc->rx_x_recv;
}
fval = scaled_div(hc->rx_s, hc->rx_rtt);
@@ -862,46 +801,31 @@ static void ccid3_hc_rx_exit(struct sock *sk)
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
tfrc_rx_hist_purge(&hc->rx_hist);
tfrc_lh_cleanup(&hc->rx_li_hist);
}
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
- const struct ccid3_hc_rx_sock *hc;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- hc = ccid3_hc_rx_sk(sk);
- info->tcpi_ca_state = hc->rx_state;
+ info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- info->tcpi_rcv_rtt = hc->rx_rtt;
+ info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
}
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
- const struct ccid3_hc_rx_sock *hc;
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct tfrc_rx_info rx_info;
const void *val;
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- hc = ccid3_hc_rx_sk(sk);
switch (optname) {
case DCCP_SOCKOPT_CCID_RX_INFO:
if (len < sizeof(rx_info))
return -EINVAL;
rx_info.tfrcrx_x_recv = hc->rx_x_recv;
rx_info.tfrcrx_rtt = hc->rx_rtt;
- rx_info.tfrcrx_p = hc->rx_pinv == 0 ? ~0U :
- scaled_div(1, hc->rx_pinv);
+ rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
len = sizeof(rx_info);
val = &rx_info;
break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 03263577665..1a9933c2967 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -42,35 +42,36 @@
#include "lib/tfrc.h"
#include "../ccid.h"
-/* Two seconds as per RFC 3448 4.2 */
+/* Two seconds as per RFC 5348, 4.2 */
#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
-/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
-#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
-
/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
#define TFRC_T_MBI 64
+/*
+ * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA USEC_PER_MSEC
+#else
+# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
+#endif
+
enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192,
TFRC_OPT_LOSS_INTERVALS = 193,
TFRC_OPT_RECEIVE_RATE = 194,
};
-struct ccid3_options_received {
- u64 ccid3or_seqno:48,
- ccid3or_loss_intervals_idx:16;
- u16 ccid3or_loss_intervals_len;
- u32 ccid3or_loss_event_rate;
- u32 ccid3or_receive_rate;
-};
-
/* TFRC sender states */
enum ccid3_hc_tx_states {
TFRC_SSTATE_NO_SENT = 1,
TFRC_SSTATE_NO_FBACK,
TFRC_SSTATE_FBACK,
- TFRC_SSTATE_TERM,
};
/**
@@ -90,19 +91,16 @@ enum ccid3_hc_tx_states {
* @tx_no_feedback_timer: Handle to no feedback timer
* @tx_t_ld: Time last doubled during slow start
* @tx_t_nom: Nominal send time of next packet
- * @tx_delta: Send timer delta (RFC 3448, 4.6) in usecs
* @tx_hist: Packet history
- * @tx_options_received: Parsed set of retrieved options
*/
struct ccid3_hc_tx_sock {
- struct tfrc_tx_info tx_tfrc;
-#define tx_x tx_tfrc.tfrctx_x
-#define tx_x_recv tx_tfrc.tfrctx_x_recv
-#define tx_x_calc tx_tfrc.tfrctx_x_calc
-#define tx_rtt tx_tfrc.tfrctx_rtt
-#define tx_p tx_tfrc.tfrctx_p
-#define tx_t_rto tx_tfrc.tfrctx_rto
-#define tx_t_ipi tx_tfrc.tfrctx_ipi
+ u64 tx_x;
+ u64 tx_x_recv;
+ u32 tx_x_calc;
+ u32 tx_rtt;
+ u32 tx_p;
+ u32 tx_t_rto;
+ u32 tx_t_ipi;
u16 tx_s;
enum ccid3_hc_tx_states tx_state:8;
u8 tx_last_win_count;
@@ -110,9 +108,7 @@ struct ccid3_hc_tx_sock {
struct timer_list tx_no_feedback_timer;
ktime_t tx_t_ld;
ktime_t tx_t_nom;
- u32 tx_delta;
struct tfrc_tx_hist_entry *tx_hist;
- struct ccid3_options_received tx_options_received;
};
static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -126,21 +122,16 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
enum ccid3_hc_rx_states {
TFRC_RSTATE_NO_DATA = 1,
TFRC_RSTATE_DATA,
- TFRC_RSTATE_TERM = 127,
};
/**
* struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
- * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3)
- * @rx_rtt: Receiver estimate of rtt (non-standard)
- * @rx_p: Current loss event rate (RFC 3448 5.4)
* @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
* @rx_state: Receiver state, one of %ccid3_hc_rx_states
* @rx_bytes_recv: Total sum of DCCP payload bytes
* @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
* @rx_rtt: Receiver estimate of RTT
* @rx_tstamp_last_feedback: Time at which last feedback was sent
- * @rx_tstamp_last_ack: Time at which last feedback was sent
* @rx_hist: Packet history (loss detection + RTT sampling)
* @rx_li_hist: Loss Interval database
* @rx_s: Received packet size in bytes
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 8fc3cbf7907..497723c4d4b 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
cur->li_length = len;
tfrc_lh_calc_i_mean(lh);
- return (lh->i_mean < old_i_mean);
+ return lh->i_mean < old_i_mean;
}
/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 3a4f414e94a..de8fe294bf0 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -38,18 +38,6 @@
#include "packet_history.h"
#include "../../dccp.h"
-/**
- * tfrc_tx_hist_entry - Simple singly-linked TX history list
- * @next: next oldest entry (LIFO order)
- * @seqno: sequence number of this entry
- * @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
- struct tfrc_tx_hist_entry *next;
- u64 seqno;
- ktime_t stamp;
-};
-
/*
* Transmitter History Routines
*/
@@ -71,15 +59,6 @@ void tfrc_tx_packet_history_exit(void)
}
}
-static struct tfrc_tx_hist_entry *
- tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
- while (head != NULL && head->seqno != seqno)
- head = head->next;
-
- return head;
-}
-
int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
{
struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -107,24 +86,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
*headp = NULL;
}
-u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
- const ktime_t now)
-{
- u32 rtt = 0;
- struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
-
- if (packet != NULL) {
- rtt = ktime_us_delta(now, packet->stamp);
- /*
- * Garbage-collect older (irrelevant) entries:
- */
- tfrc_tx_hist_purge(&packet->next);
- }
-
- return rtt;
-}
-
-
/*
* Receiver History Routines
*/
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 7df6c529999..7ee4a9d9d33 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
#include <linux/slab.h>
#include "tfrc.h"
-struct tfrc_tx_hist_entry;
+/**
+ * tfrc_tx_hist_entry - Simple singly-linked TX history list
+ * @next: next oldest entry (LIFO order)
+ * @seqno: sequence number of this entry
+ * @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+ struct tfrc_tx_hist_entry *next;
+ u64 seqno;
+ ktime_t stamp;
+};
+
+static inline struct tfrc_tx_hist_entry *
+ tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+ while (head != NULL && head->seqno != seqno)
+ head = head->next;
+ return head;
+}
extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
- const u64 seqno, const ktime_t now);
/* Subtraction a-b modulo-16, respects circular wrap-around */
#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 01bb48e96c2..f8ee3f54977 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -57,6 +57,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
extern int tfrc_tx_packet_history_init(void);
extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 22ca1cf0eb5..a052a4377e2 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -687,3 +687,17 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
index = tfrc_binsearch(fvalue, 0);
return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
}
+
+/**
+ * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+ if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
+ return 0;
+ if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
+ return 1000000;
+ return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
+}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 3ccef1b70fe..3eb264b6082 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -153,18 +153,27 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
}
/**
- * dccp_loss_free - Evaluates condition for data loss from RFC 4340, 7.7.1
- * @s1: start sequence number
- * @s2: end sequence number
+ * dccp_loss_count - Approximate the number of lost data packets in a burst loss
+ * @s1: last known sequence number before the loss ('hole')
+ * @s2: first sequence number seen after the 'hole'
* @ndp: NDP count on packet with sequence number @s2
- * Returns true if the sequence range s1...s2 has no data loss.
*/
-static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
{
s64 delta = dccp_delta_seqno(s1, s2);
WARN_ON(delta < 0);
- return (u64)delta <= ndp + 1;
+ delta -= ndp + 1;
+
+ return delta > 0 ? delta : 0;
+}
+
+/**
+ * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
+ */
+static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+{
+ return dccp_loss_count(s1, s2, ndp) == 0;
}
enum {
@@ -246,7 +255,6 @@ static inline void dccp_clear_xmit_timers(struct sock *sk)
extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
extern const char *dccp_packet_name(const int type);
-extern const char *dccp_state_name(const int state);
extern void dccp_set_state(struct sock *sk, const int state);
extern void dccp_done(struct sock *sk);
@@ -415,6 +423,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq)
dp->dccps_gsr = seq;
/* Sequence validity window depends on remote Sequence Window (7.5.1) */
dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+ /*
+ * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+ * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+ * always > 32, so for the first W/W' packets in the lifetime of a
+ * connection we always have to adjust SWL.
+ * A second reason why we are doing this is that the window depends on
+ * the feature-remote value of Sequence Window: nothing stops the peer
+ * from updating this value while we are busy adjusting SWL for the
+ * first W packets (we would have to count from scratch again then).
+ * Therefore it is safer to always make sure that the Sequence Window
+ * is not artificially extended by a peer who grows SWL downwards by
+ * continually updating the feature-remote Sequence-Window.
+ * If sequence numbers wrap it is bad luck. But that will take a while
+ * (48 bit), and this measure prevents Sequence-number attacks.
+ */
+ if (before48(dp->dccps_swl, dp->dccps_isr))
+ dp->dccps_swl = dp->dccps_isr;
dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
}
@@ -425,14 +450,16 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
dp->dccps_gss = seq;
/* Ack validity window depends on local Sequence Window value (7.5.1) */
dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+ /* Adjust AWL so that it is not below ISS - see comment above for SWL */
+ if (before48(dp->dccps_awl, dp->dccps_iss))
+ dp->dccps_awl = dp->dccps_iss;
dp->dccps_awh = dp->dccps_gss;
}
static inline int dccp_ack_pending(const struct sock *sk)
{
const struct dccp_sock *dp = dccp_sk(sk);
- return dp->dccps_timestamp_echo != 0 ||
- (dp->dccps_hc_rx_ackvec != NULL &&
+ return (dp->dccps_hc_rx_ackvec != NULL &&
dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
inet_csk_ack_scheduled(sk);
}
@@ -449,7 +476,6 @@ extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed);
extern u32 dccp_timestamp(void);
extern void dccp_timestamping_init(void);
-extern int dccp_insert_option_timestamp(struct sk_buff *skb);
extern int dccp_insert_option(struct sk_buff *skb, unsigned char option,
const void *value, unsigned char len);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index df7dd26cf07..568def95272 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -730,16 +730,6 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
0, list, len);
}
-/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
-int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
-{
- /* any changes must be registered before establishing the connection */
- if (sk->sk_state != DCCP_CLOSED)
- return -EISCONN;
- if (dccp_feat_type(feat) != FEAT_NN)
- return -EINVAL;
- return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
-}
/*
* Tracking features whose value depend on the choice of CCID
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index f96721619de..e56a4e5e634 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -111,7 +111,6 @@ extern int dccp_feat_init(struct sock *sk);
extern void dccp_feat_initialise_sysctls(void);
extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
u8 const *list, u8 len);
-extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 10c957a88f4..265985370fa 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -259,7 +259,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
sysctl_dccp_sync_ratelimit)))
return 0;
- DCCP_WARN("DCCP: Step 6 failed for %s packet, "
+ DCCP_WARN("Step 6 failed for %s packet, "
"(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
"(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
"sending SYNC...\n", dccp_packet_name(dh->dccph_type),
@@ -441,20 +441,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
kfree_skb(sk->sk_send_head);
sk->sk_send_head = NULL;
- dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_update_gsr(sk, dp->dccps_isr);
/*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- *
- * AWL was adjusted in dccp_v4_connect -acme
+ * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+ * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+ * is done as part of activating the feature values below, since
+ * these settings depend on the local/remote Sequence Window
+ * features, which were undefined or not confirmed until now.
*/
- dccp_set_seqno(&dp->dccps_swl,
- max48(dp->dccps_swl, dp->dccps_isr));
+ dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d4a166f0f39..3f69ea11482 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto exit;
+ goto exit_nonewsk;
sk_setup_caps(newsk, dst);
@@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
dccp_sync_mss(newsk, dst_mtu(dst));
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ sock_put(newsk);
+ goto exit;
+ }
__inet_hash_nolisten(newsk, NULL);
- __inet_inherit_port(sk, newsk);
return newsk;
exit_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+ dst_release(dst);
exit:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
- dst_release(dst);
return NULL;
}
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6e3f32575df..dca711df9b6 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -564,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto out;
+ goto out_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -632,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ sock_put(newsk);
+ goto out;
+ }
__inet6_hash(newsk, NULL);
- __inet_inherit_port(sk, newsk);
return newsk;
out_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+ dst_release(dst);
out:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
if (opt != NULL && opt != np->opt)
sock_kfree_s(sk, opt, opt->tot_len);
- dst_release(dst);
return NULL;
}
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 128b089d3ae..d7041a0963a 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -121,30 +121,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
*
* Choose S.ISS (initial seqno) or set from Init Cookies
* Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- */
- newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
- dccp_update_gss(newsk, dreq->dreq_iss);
-
- newdp->dccps_isr = dreq->dreq_isr;
- dccp_update_gsr(newsk, dreq->dreq_isr);
-
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
+ * Set S.ISR, S.GSR from packet (or Init Cookies)
+ *
+ * Setting AWL/AWH and SWL/SWH happens as part of the feature
+ * activation below, as these windows all depend on the local
+ * and remote Sequence Window feature values (7.5.2).
*/
- dccp_set_seqno(&newdp->dccps_swl,
- max48(newdp->dccps_swl, newdp->dccps_isr));
- dccp_set_seqno(&newdp->dccps_awl,
- max48(newdp->dccps_awl, newdp->dccps_iss));
+ newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss;
+ newdp->dccps_gar = newdp->dccps_iss;
+ newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr;
/*
- * Activate features after initialising the sequence numbers,
- * since CCID initialisation may depend on GSS, ISR, ISS etc.
+ * Activate features: initialise CCIDs, sequence windows etc.
*/
if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
/* It is still raw copy of parent, so invalidate
diff --git a/net/dccp/options.c b/net/dccp/options.c
index bfda087bd90..cd306181300 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
}
/*
- * CCID-Specific Options (from RFC 4340, sec. 10.3):
- *
- * Option numbers 128 through 191 are for options sent from the
- * HC-Sender to the HC-Receiver; option numbers 192 through 255
- * are for options sent from the HC-Receiver to the HC-Sender.
- *
* CCID-specific options are ignored during connection setup, as
* negotiation may still be in progress (see RFC 4340, 10.3).
* The same applies to Ack Vectors, as these depend on the CCID.
- *
*/
- if (dreq != NULL && (opt >= 128 ||
+ if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
goto ignore_option;
@@ -170,6 +163,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
dccp_role(sk), ntohl(opt_val),
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ /* schedule an Ack in case this sender is quiescent */
+ inet_csk_schedule_ack(sk);
break;
case DCCPO_TIMESTAMP_ECHO:
if (len != 4 && len != 6 && len != 8)
@@ -226,23 +221,15 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
dccp_role(sk), elapsed_time);
break;
- case 128 ... 191: {
- const u16 idx = value - options;
-
+ case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
- case 192 ... 255: {
- const u16 idx = value - options;
-
+ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
default:
DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -384,7 +371,7 @@ int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time)
EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
-int dccp_insert_option_timestamp(struct sk_buff *skb)
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
{
__be32 now = htonl(dccp_timestamp());
/* yes this will overflow but that is the point as we want a
@@ -393,8 +380,6 @@ int dccp_insert_option_timestamp(struct sk_buff *skb)
return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
}
-EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
-
static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
struct dccp_request_sock *dreq,
struct sk_buff *skb)
diff --git a/net/dccp/output.c b/net/dccp/output.c
index aadbdb58758..a988fe9ffcb 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -304,7 +304,7 @@ void dccp_write_xmit(struct sock *sk, int block)
dcb->dccpd_type = DCCP_PKT_DATA;
err = dccp_transmit_skb(sk, skb);
- ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
+ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
if (err)
DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
err);
@@ -474,8 +474,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
/*
* Do all connect socket setups that can be done AF independent.
*/
-static inline void dccp_connect_init(struct sock *sk)
+int dccp_connect(struct sock *sk)
{
+ struct sk_buff *skb;
struct dccp_sock *dp = dccp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -485,22 +486,12 @@ static inline void dccp_connect_init(struct sock *sk)
dccp_sync_mss(sk, dst_mtu(dst));
- /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
- dp->dccps_gar = dp->dccps_iss;
-
- icsk->icsk_retransmits = 0;
-}
-
-int dccp_connect(struct sock *sk)
-{
- struct sk_buff *skb;
- struct inet_connection_sock *icsk = inet_csk(sk);
-
/* do not connect if feature negotiation setup fails */
if (dccp_feat_finalise_settings(dccp_sk(sk)))
return -EPROTO;
- dccp_connect_init(sk);
+ /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
+ dp->dccps_gar = dp->dccps_iss;
skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
if (unlikely(skb == NULL))
@@ -516,6 +507,7 @@ int dccp_connect(struct sock *sk)
DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
/* Timer for repeating the REQUEST until an answer. */
+ icsk->icsk_retransmits = 0;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, DCCP_RTO_MAX);
return 0;
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 078e48d442f..33d0e6297c2 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -149,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
.owner = THIS_MODULE,
.open = dccpprobe_open,
.read = dccpprobe_read,
+ .llseek = noop_llseek,
};
static __init int dccpprobe_init(void)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 096250d1323..7e5fc04eb6d 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -50,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
/* the maximum queue length for tx in packets. 0 is no limit */
int sysctl_dccp_tx_qlen __read_mostly = 5;
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_state_name(const int state)
+{
+ static const char *const dccp_state_names[] = {
+ [DCCP_OPEN] = "OPEN",
+ [DCCP_REQUESTING] = "REQUESTING",
+ [DCCP_PARTOPEN] = "PARTOPEN",
+ [DCCP_LISTEN] = "LISTEN",
+ [DCCP_RESPOND] = "RESPOND",
+ [DCCP_CLOSING] = "CLOSING",
+ [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
+ [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
+ [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
+ [DCCP_TIME_WAIT] = "TIME_WAIT",
+ [DCCP_CLOSED] = "CLOSED",
+ };
+
+ if (state >= DCCP_MAX_STATES)
+ return "INVALID STATE!";
+ else
+ return dccp_state_names[state];
+}
+#endif
+
void dccp_set_state(struct sock *sk, const int state)
{
const int oldstate = sk->sk_state;
@@ -146,30 +170,6 @@ const char *dccp_packet_name(const int type)
EXPORT_SYMBOL_GPL(dccp_packet_name);
-const char *dccp_state_name(const int state)
-{
- static const char *const dccp_state_names[] = {
- [DCCP_OPEN] = "OPEN",
- [DCCP_REQUESTING] = "REQUESTING",
- [DCCP_PARTOPEN] = "PARTOPEN",
- [DCCP_LISTEN] = "LISTEN",
- [DCCP_RESPOND] = "RESPOND",
- [DCCP_CLOSING] = "CLOSING",
- [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
- [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
- [DCCP_TIME_WAIT] = "TIME_WAIT",
- [DCCP_CLOSED] = "CLOSED",
- };
-
- if (state >= DCCP_MAX_STATES)
- return "INVALID STATE!";
- else
- return dccp_state_names[state];
-}
-
-EXPORT_SYMBOL_GPL(dccp_state_name);
-
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
{
struct dccp_sock *dp = dccp_sk(sk);
@@ -944,7 +944,7 @@ void dccp_close(struct sock *sk, long timeout)
if (data_was_unread) {
/* Unread data was tossed, send an appropriate Reset Code */
- DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread);
+ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
dccp_set_state(sk, DCCP_CLOSED);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 0363bb95cc7..a085dbcf5c7 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -48,7 +48,6 @@
#include <net/dn_neigh.h>
#include <net/dn_route.h>
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev);
static int dn_neigh_construct(struct neighbour *);
static void dn_long_error_report(struct neighbour *, struct sk_buff *);
static void dn_short_error_report(struct neighbour *, struct sk_buff *);
@@ -93,6 +92,13 @@ static const struct neigh_ops dn_phase3_ops = {
.queue_xmit = dev_queue_xmit
};
+static u32 dn_neigh_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 hash_rnd)
+{
+ return jhash_2words(*(__u16 *)pkey, 0, hash_rnd);
+}
+
struct neigh_table dn_neigh_table = {
.family = PF_DECnet,
.entry_size = sizeof(struct dn_neigh),
@@ -122,11 +128,6 @@ struct neigh_table dn_neigh_table = {
.gc_thresh3 = 1024,
};
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev)
-{
- return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd);
-}
-
static int dn_neigh_construct(struct neighbour *neigh)
{
struct net_device *dev = neigh->dev;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index baeb1eaf011..2ef115277be 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -693,22 +693,22 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
aux = scp->accessdata.acc_userl;
*skb_put(skb, 1) = aux;
if (aux > 0)
- memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux);
+ memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux);
aux = scp->accessdata.acc_passl;
*skb_put(skb, 1) = aux;
if (aux > 0)
- memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux);
+ memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux);
aux = scp->accessdata.acc_accl;
*skb_put(skb, 1) = aux;
if (aux > 0)
- memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
+ memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
*skb_put(skb, 1) = aux;
if (aux > 0)
- memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux);
+ memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux);
scp->persist = dn_nsp_persist(sk);
scp->persist_fxn = dn_nsp_retrans_conninit;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6585ea6d118..df0f3e54ff8 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -132,7 +132,6 @@ static struct dst_ops dn_dst_ops = {
.negative_advice = dn_dst_negative_advice,
.link_failure = dn_dst_link_failure,
.update_pmtu = dn_dst_update_pmtu,
- .entries = ATOMIC_INIT(0),
};
static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
@@ -1758,6 +1757,7 @@ void __init dn_route_init(void)
dn_dst_ops.kmem_cachep =
kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ dst_entries_init(&dn_dst_ops);
setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
add_timer(&dn_route_timer);
@@ -1816,5 +1816,6 @@ void __exit dn_route_cleanup(void)
dn_run_flush(0);
proc_net_remove(&init_net, "decnet_cache");
+ dst_entries_destroy(&dn_dst_ops);
}
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 400a04d5c9a..739435a6af3 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -29,6 +29,7 @@
#include <linux/kernel.h>
#include <linux/keyctl.h>
#include <linux/err.h>
+#include <linux/seq_file.h>
#include <keys/dns_resolver-type.h>
#include <keys/user-type.h>
#include "internal.h"
@@ -43,6 +44,8 @@ MODULE_PARM_DESC(debug, "DNS Resolver debugging mask");
const struct cred *dns_resolver_cache;
+#define DNS_ERRORNO_OPTION "dnserror"
+
/*
* Instantiate a user defined key for dns_resolver.
*
@@ -59,9 +62,10 @@ static int
dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
{
struct user_key_payload *upayload;
+ unsigned long derrno;
int ret;
size_t result_len = 0;
- const char *data = _data, *opt;
+ const char *data = _data, *end, *opt;
kenter("%%%d,%s,'%s',%zu",
key->serial, key->description, data, datalen);
@@ -71,13 +75,77 @@ dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
datalen--;
/* deal with any options embedded in the data */
+ end = data + datalen;
opt = memchr(data, '#', datalen);
if (!opt) {
- kdebug("no options currently supported");
- return -EINVAL;
+ /* no options: the entire data is the result */
+ kdebug("no options");
+ result_len = datalen;
+ } else {
+ const char *next_opt;
+
+ result_len = opt - data;
+ opt++;
+ kdebug("options: '%s'", opt);
+ do {
+ const char *eq;
+ int opt_len, opt_nlen, opt_vlen, tmp;
+
+ next_opt = memchr(opt, '#', end - opt) ?: end;
+ opt_len = next_opt - opt;
+ if (!opt_len) {
+ printk(KERN_WARNING
+ "Empty option to dns_resolver key %d\n",
+ key->serial);
+ return -EINVAL;
+ }
+
+ eq = memchr(opt, '=', opt_len) ?: end;
+ opt_nlen = eq - opt;
+ eq++;
+ opt_vlen = next_opt - eq; /* will be -1 if no value */
+
+ tmp = opt_vlen >= 0 ? opt_vlen : 0;
+ kdebug("option '%*.*s' val '%*.*s'",
+ opt_nlen, opt_nlen, opt, tmp, tmp, eq);
+
+ /* see if it's an error number representing a DNS error
+ * that's to be recorded as the result in this key */
+ if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
+ memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
+ kdebug("dns error number option");
+ if (opt_vlen <= 0)
+ goto bad_option_value;
+
+ ret = strict_strtoul(eq, 10, &derrno);
+ if (ret < 0)
+ goto bad_option_value;
+
+ if (derrno < 1 || derrno > 511)
+ goto bad_option_value;
+
+ kdebug("dns error no. = %lu", derrno);
+ key->type_data.x[0] = -derrno;
+ continue;
+ }
+
+ bad_option_value:
+ printk(KERN_WARNING
+ "Option '%*.*s' to dns_resolver key %d:"
+ " bad/missing value\n",
+ opt_nlen, opt_nlen, opt, key->serial);
+ return -EINVAL;
+ } while (opt = next_opt + 1, opt < end);
+ }
+
+ /* don't cache the result if we're caching an error saying there's no
+ * result */
+ if (key->type_data.x[0]) {
+ kleave(" = 0 [h_error %ld]", key->type_data.x[0]);
+ return 0;
}
- result_len = datalen;
+ kdebug("store result");
ret = key_payload_reserve(key, result_len);
if (ret < 0)
return -EINVAL;
@@ -135,13 +203,27 @@ no_match:
return ret;
}
+/*
+ * Describe a DNS key
+ */
+static void dns_resolver_describe(const struct key *key, struct seq_file *m)
+{
+ int err = key->type_data.x[0];
+
+ seq_puts(m, key->description);
+ if (err)
+ seq_printf(m, ": %d", err);
+ else
+ seq_printf(m, ": %u", key->datalen);
+}
+
struct key_type key_type_dns_resolver = {
.name = "dns_resolver",
.instantiate = dns_resolver_instantiate,
.match = dns_resolver_match,
.revoke = user_revoke,
.destroy = user_destroy,
- .describe = user_describe,
+ .describe = dns_resolver_describe,
.read = user_read,
};
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 03d5255f5cf..c32be292c7e 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -136,6 +136,11 @@ int dns_query(const char *type, const char *name, size_t namelen,
if (ret < 0)
goto put;
+ /* If the DNS server gave an error, return that to the caller */
+ ret = rkey->type_data.x[0];
+ if (ret)
+ goto put;
+
upayload = rcu_dereference_protected(rkey->payload.data,
lockdep_is_held(&rkey->sem));
len = upayload->datalen;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 11201784d29..87bb5f4de0e 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,7 +1,7 @@
menuconfig NET_DSA
bool "Distributed Switch Architecture support"
default n
- depends on EXPERIMENTAL && NET_ETHERNET && !S390
+ depends on EXPERIMENTAL && NETDEVICES && !S390
select PHYLIB
---help---
This allows you to use hardware switch chips that use
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index dc54bd0d083..f8c1ae4b41f 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -392,7 +392,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
dev_queue_xmit(skb);
dev_put(dev);
mutex_unlock(&econet_mutex);
- return(len);
+ return len;
out_free:
kfree_skb(skb);
@@ -637,7 +637,7 @@ static int econet_create(struct net *net, struct socket *sock, int protocol,
eo->num = protocol;
econet_insert_socket(&econet_sklist, sk);
- return(0);
+ return 0;
out:
return err;
}
@@ -1009,7 +1009,6 @@ static int __init aun_udp_initialise(void)
struct sockaddr_in sin;
skb_queue_head_init(&aun_queue);
- spin_lock_init(&aun_queue_lock);
setup_timer(&ab_cleanup_timer, ab_cleanup, 0);
ab_cleanup_timer.expires = jiffies + (HZ*2);
add_timer(&ab_cleanup_timer);
@@ -1167,7 +1166,6 @@ static int __init econet_proto_init(void)
goto out;
sock_register(&econet_family_ops);
#ifdef CONFIG_ECONET_AUNUDP
- spin_lock_init(&aun_queue_lock);
aun_udp_initialise();
#endif
#ifdef CONFIG_ECONET_NATIVE
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 215c83986a9..f00ef2f1d81 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -367,7 +367,7 @@ struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count)
EXPORT_SYMBOL(alloc_etherdev_mq);
static size_t _format_mac_addr(char *buf, int buflen,
- const unsigned char *addr, int len)
+ const unsigned char *addr, int len)
{
int i;
char *cp = buf;
@@ -376,7 +376,7 @@ static size_t _format_mac_addr(char *buf, int buflen,
cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]);
if (i == len - 1)
break;
- cp += strlcpy(cp, ":", buflen - (cp - buf));
+ cp += scnprintf(cp, buflen - (cp - buf), ":");
}
return cp - buf;
}
@@ -386,7 +386,7 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
size_t l;
l = _format_mac_addr(buf, PAGE_SIZE, addr, len);
- l += strlcpy(buf + l, "\n", PAGE_SIZE - l);
- return ((ssize_t) l);
+ l += scnprintf(buf + l, PAGE_SIZE - l, "\n");
+ return (ssize_t)l;
}
EXPORT_SYMBOL(sysfs_format_mac);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 1cc7ef270d5..9e95d7fb6d5 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
rp_filter on use:
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- and
+ or
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
Note that some distributions enable it in startup scripts.
@@ -215,8 +215,15 @@ config NET_IPIP
be inserted in and removed from the running kernel whenever you
want). Most people won't need this and can say N.
+config NET_IPGRE_DEMUX
+ tristate "IP: GRE demultiplexer"
+ help
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
+
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -412,7 +419,7 @@ config INET_XFRM_MODE_BEET
If unsure, say Y.
config INET_LRO
- bool "Large Receive Offload (ipv4/tcp)"
+ tristate "Large Receive Offload (ipv4/tcp)"
default y
---help---
Support for Large Receive Offload (ipv4/tcp).
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43a..4978d22f9a7 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6a1100c25a9..f581f77d109 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
/*
* inet_ehash_secret must be set exactly once
- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
*/
void build_ehash_secret(void)
{
u32 rnd;
+
do {
get_random_bytes(&rnd, sizeof(rnd));
} while (rnd == 0);
- spin_lock_bh(&inetsw_lock);
- if (!inet_ehash_secret)
- inet_ehash_secret = rnd;
- spin_unlock_bh(&inetsw_lock);
+
+ cmpxchg(&inet_ehash_secret, 0, rnd);
}
EXPORT_SYMBOL(build_ehash_secret);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96c1955b3e2..d8e540c5b07 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
* Stuart Cheshire : Metricom and grat arp fixes
* *** FOR 2.1 clean this up ***
* Lawrence V. Stefani: (08/12/96) Added FDDI support.
- * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
* folded into the mainstream FDDI code.
* Ack spit, Linus how did you allow that
* one in...
@@ -120,14 +120,14 @@ EXPORT_SYMBOL(clip_tbl_hook);
#endif
#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netfilter_arp.h>
/*
* Interface to generic neighbour cache.
*/
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
.queue_xmit = dev_queue_xmit,
};
-const struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
@@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = {
.hh_output = dev_queue_xmit,
.queue_xmit = dev_queue_xmit,
};
-EXPORT_SYMBOL(arp_broken_ops);
struct neigh_table arp_tbl = {
- .family = AF_INET,
- .entry_size = sizeof(struct neighbour) + 4,
- .key_len = 4,
- .hash = arp_hash,
- .constructor = arp_constructor,
- .proxy_redo = parp_redo,
- .id = "arp_cache",
- .parms = {
- .tbl = &arp_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
+ .family = AF_INET,
+ .entry_size = sizeof(struct neighbour) + 4,
+ .key_len = 4,
+ .hash = arp_hash,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .base_reachable_time = 30 * HZ,
+ .retrans_time = 1 * HZ,
+ .gc_staletime = 60 * HZ,
+ .reachable_time = 30 * HZ,
+ .delay_probe_time = 5 * HZ,
+ .queue_len = 3,
+ .ucast_probes = 3,
+ .mcast_probes = 3,
+ .anycast_delay = 1 * HZ,
+ .proxy_delay = (8 * HZ) / 10,
+ .proxy_qlen = 64,
+ .locktime = 1 * HZ,
},
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
};
EXPORT_SYMBOL(arp_tbl);
@@ -226,14 +225,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
}
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 hash_rnd)
{
- return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+ return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
}
static int arp_constructor(struct neighbour *neigh)
{
- __be32 addr = *(__be32*)neigh->primary_key;
+ __be32 addr = *(__be32 *)neigh->primary_key;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
@@ -296,16 +297,19 @@ static int arp_constructor(struct neighbour *neigh)
neigh->ops = &arp_broken_ops;
neigh->output = neigh->ops->output;
return 0;
+#else
+ break;
#endif
- ;}
+ }
#endif
if (neigh->type == RTN_MULTICAST) {
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
- } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
- } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+ } else if (neigh->type == RTN_BROADCAST ||
+ (dev->flags & IFF_POINTOPOINT)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
@@ -315,7 +319,7 @@ static int arp_constructor(struct neighbour *neigh)
else
neigh->ops = &arp_generic_ops;
- if (neigh->nud_state&NUD_VALID)
+ if (neigh->nud_state & NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output;
@@ -334,7 +338,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
__be32 saddr = 0;
u8 *dst_ha = NULL;
struct net_device *dev = neigh->dev;
- __be32 target = *(__be32*)neigh->primary_key;
+ __be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
struct in_device *in_dev;
@@ -347,7 +351,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
- if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL)
+ if (skb && inet_addr_type(dev_net(dev),
+ ip_hdr(skb)->saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)->saddr;
break;
case 1: /* Restrict announcements of saddr in same subnet */
@@ -369,16 +374,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
- if ((probes -= neigh->parms->ucast_probes) < 0) {
- if (!(neigh->nud_state&NUD_VALID))
- printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+ probes -= neigh->parms->ucast_probes;
+ if (probes < 0) {
+ if (!(neigh->nud_state & NUD_VALID))
+ printk(KERN_DEBUG
+ "trying to ucast probe in NUD_INVALID\n");
dst_ha = neigh->ha;
read_lock_bh(&neigh->lock);
- } else if ((probes -= neigh->parms->app_probes) < 0) {
+ } else {
+ probes -= neigh->parms->app_probes;
+ if (probes < 0) {
#ifdef CONFIG_ARPD
- neigh_app_ns(neigh);
+ neigh_app_ns(neigh);
#endif
- return;
+ return;
+ }
}
arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -451,7 +461,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
* is allowed to use this function, it is scheduled to be removed. --ANK
*/
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
+ __be32 paddr, struct net_device *dev)
{
switch (addr_hint) {
case RTN_LOCAL:
@@ -483,17 +494,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
paddr = skb_rtable(skb)->rt_gateway;
- if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev))
+ if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+ paddr, dev))
return 0;
n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
if (n) {
n->used = jiffies;
- if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
- read_lock_bh(&n->lock);
- memcpy(haddr, n->ha, dev->addr_len);
- read_unlock_bh(&n->lock);
+ if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+ neigh_ha_snapshot(haddr, n, dev);
neigh_release(n);
return 0;
}
@@ -515,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
return -EINVAL;
if (n == NULL) {
__be32 nexthop = ((struct rtable *)dst)->rt_gateway;
- if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
nexthop = 0;
n = __neigh_lookup_errno(
#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
- dev->type == ARPHRD_ATM ? clip_tbl_hook :
+ dev->type == ARPHRD_ATM ?
+ clip_tbl_hook :
#endif
- &arp_tbl, &nexthop, dev);
+ &arp_tbl, &nexthop, dev);
if (IS_ERR(n))
return PTR_ERR(n);
dst->neighbour = n;
@@ -543,8 +554,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
if (!IN_DEV_PROXY_ARP(in_dev))
return 0;
-
- if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+ imi = IN_DEV_MEDIUM_ID(in_dev);
+ if (imi == 0)
return 1;
if (imi == -1)
return 0;
@@ -555,7 +566,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
if (out_dev)
omi = IN_DEV_MEDIUM_ID(out_dev);
- return (omi != imi && omi != -1);
+ return omi != imi && omi != -1;
}
/*
@@ -685,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp->ar_pln = 4;
arp->ar_op = htons(type);
- arp_ptr=(unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
memcpy(arp_ptr, src_hw, dev->addr_len);
arp_ptr += dev->addr_len;
@@ -735,9 +746,8 @@ void arp_send(int type, int ptype, __be32 dest_ip,
skb = arp_create(type, ptype, dest_ip, dev, src_ip,
dest_hw, src_hw, target_hw);
- if (skb == NULL) {
+ if (skb == NULL)
return;
- }
arp_xmit(skb);
}
@@ -815,7 +825,7 @@ static int arp_process(struct sk_buff *skb)
/*
* Extract fields
*/
- arp_ptr= (unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
sha = arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
@@ -869,16 +879,17 @@ static int arp_process(struct sk_buff *skb)
addr_type = rt->rt_type;
if (addr_type == RTN_LOCAL) {
- int dont_send = 0;
+ int dont_send;
- if (!dont_send)
- dont_send |= arp_ignore(in_dev,sip,tip);
+ dont_send = arp_ignore(in_dev, sip, tip);
if (!dont_send && IN_DEV_ARPFILTER(in_dev))
- dont_send |= arp_filter(sip,tip,dev);
+ dont_send |= arp_filter(sip, tip, dev);
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
neigh_release(n);
}
}
@@ -887,8 +898,7 @@ static int arp_process(struct sk_buff *skb)
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
- pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
- {
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
@@ -896,9 +906,12 @@ static int arp_process(struct sk_buff *skb)
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
in_dev->arp_parms->proxy_delay == 0) {
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
} else {
- pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+ pneigh_enqueue(&arp_tbl,
+ in_dev->arp_parms, skb);
return 0;
}
goto out;
@@ -939,7 +952,8 @@ static int arp_process(struct sk_buff *skb)
if (arp->ar_op != htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
- neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_update(n, sha, state,
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0);
neigh_release(n);
}
@@ -975,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
arp->ar_pln != 4)
goto freeskb;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (skb == NULL)
goto out_of_mem;
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1019,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
return -EINVAL;
if (!dev && (r->arp_flags & ATF_COM)) {
dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
- r->arp_ha.sa_data);
+ r->arp_ha.sa_data);
if (!dev)
return -ENODEV;
}
@@ -1033,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
}
static int arp_req_set(struct net *net, struct arpreq *r,
- struct net_device * dev)
+ struct net_device *dev)
{
__be32 ip;
struct neighbour *neigh;
@@ -1046,10 +1061,11 @@ static int arp_req_set(struct net *net, struct arpreq *r,
if (r->arp_flags & ATF_PERM)
r->arp_flags |= ATF_COM;
if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+ struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } };
+ struct rtable *rt;
+ err = ip_route_output_key(net, &rt, &fl);
+ if (err != 0)
return err;
dev = rt->dst.dev;
ip_rt_put(rt);
@@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
unsigned state = NUD_STALE;
if (r->arp_flags & ATF_PERM)
state = NUD_PERMANENT;
- err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+ err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
r->arp_ha.sa_data : NULL, state,
- NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN);
neigh_release(neigh);
}
@@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
static unsigned arp_state_to_flags(struct neighbour *neigh)
{
- unsigned flags = 0;
if (neigh->nud_state&NUD_PERMANENT)
- flags = ATF_PERM|ATF_COM;
+ return ATF_PERM | ATF_COM;
else if (neigh->nud_state&NUD_VALID)
- flags = ATF_COM;
- return flags;
+ return ATF_COM;
+ else
+ return 0;
}
/*
@@ -1142,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
}
static int arp_req_delete(struct net *net, struct arpreq *r,
- struct net_device * dev)
+ struct net_device *dev)
{
int err;
__be32 ip;
@@ -1153,10 +1169,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+ struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } };
+ struct rtable *rt;
+ err = ip_route_output_key(net, &rt, &fl);
+ if (err != 0)
return err;
dev = rt->dst.dev;
ip_rt_put(rt);
@@ -1166,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
err = -ENXIO;
neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
- if (neigh->nud_state&~NUD_NOARP)
+ if (neigh->nud_state & ~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN);
@@ -1186,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
struct net_device *dev = NULL;
switch (cmd) {
- case SIOCDARP:
- case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- case SIOCGARP:
- err = copy_from_user(&r, arg, sizeof(struct arpreq));
- if (err)
- return -EFAULT;
- break;
- default:
- return -EINVAL;
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
}
if (r.arp_pa.sa_family != AF_INET)
return -EPFNOSUPPORT;
if (!(r.arp_flags & ATF_PUBL) &&
- (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+ (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
return -EINVAL;
if (!(r.arp_flags & ATF_NETMASK))
((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1211,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL)
+ dev = __dev_get_by_name(net, r.arp_dev);
+ if (dev == NULL)
goto out;
/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1243,7 +1261,8 @@ out:
return err;
}
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
{
struct net_device *dev = ptr;
@@ -1311,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
for (n = 0, s = buf; n < 6; n++) {
c = (a->ax25_call[n] >> 1) & 0x7F;
- if (c != ' ') *s++ = c;
+ if (c != ' ')
+ *s++ = c;
}
*s++ = '-';
-
- if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ n = (a->ax25_call[6] >> 1) & 0x0F;
+ if (n > 9) {
*s++ = '1';
n -= 10;
}
@@ -1325,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
*s++ = '\0';
if (*buf == '\0' || *buf == '-')
- return "*";
+ return "*";
return buf;
-
}
#endif /* CONFIG_AX25 */
@@ -1408,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
/* ------------------------------------------------------------------------ */
static const struct seq_operations arp_seq_ops = {
- .start = arp_seq_start,
- .next = neigh_seq_next,
- .stop = neigh_seq_stop,
- .show = arp_seq_show,
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
};
static int arp_seq_open(struct inode *inode, struct file *file)
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f0550941df7..174be6caa5c 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,14 +62,17 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
if (!inet->inet_saddr)
inet->inet_saddr = rt->rt_src; /* Update source address */
- if (!inet->inet_rcv_saddr)
+ if (!inet->inet_rcv_saddr) {
inet->inet_rcv_saddr = rt->rt_src;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
inet->inet_daddr = rt->rt_dst;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
- return(0);
+ return 0;
}
EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f..dc94b0316b7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
inet_free_ifa(ifa);
}
- dev->ip_ptr = NULL;
+ rcu_assign_pointer(dev->ip_ptr, NULL);
devinet_sysctl_unregister(in_dev);
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
return inet_insert_ifa(ifa);
}
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
@@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
- in_dev = in_dev_get(dev);
+ in_dev = rcu_dereference_rtnl(dev->ip_ptr);
rcu_read_unlock();
return in_dev;
}
@@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
goto errout;
}
- __in_dev_put(in_dev);
-
for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
@@ -1059,7 +1060,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
switch (event) {
case NETDEV_REGISTER:
printk(KERN_DEBUG "inetdev_event: bug\n");
- dev->ip_ptr = NULL;
+ rcu_assign_pointer(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
if (!inetdev_valid_mtu(dev->mtu))
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a4396891835..36e27c2107d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,35 +147,43 @@ static void fib_flush(struct net *net)
rt_cache_flush(net, -1);
}
-/*
- * Find the first device with a given source address.
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
*/
-
-struct net_device * ip_dev_find(struct net *net, __be32 addr)
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
- struct fib_result res;
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = addr
+ }
+ },
+ .flags = FLOWI_FLAG_MATCH_ANY_IIF
+ };
+ struct fib_result res = { 0 };
struct net_device *dev = NULL;
- struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!local_table || fib_table_lookup(local_table, &fl, &res))
+ rcu_read_lock();
+ if (fib_lookup(net, &fl, &res)) {
+ rcu_read_unlock();
return NULL;
+ }
if (res.type != RTN_LOCAL)
goto out;
dev = FIB_RES_DEV(res);
- if (dev)
+ if (dev && devref)
dev_hold(dev);
out:
- fib_res_put(&res);
+ rcu_read_unlock();
return dev;
}
-EXPORT_SYMBOL(ip_dev_find);
+EXPORT_SYMBOL(__ip_dev_find);
/*
* Find address type as if only "dev" was present in the system. If
@@ -202,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
- if (!fib_table_lookup(local_table, &fl, &res)) {
+ rcu_read_lock();
+ if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
- fib_res_put(&res);
}
+ rcu_read_unlock();
}
return ret;
}
@@ -220,37 +229,41 @@ EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- return __inet_dev_addr_type(net, dev, addr);
+ return __inet_dev_addr_type(net, dev, addr);
}
EXPORT_SYMBOL(inet_dev_addr_type);
/* Given (packet source, input interface) and optional (dst, oif, tos):
- - (main) check, that source is valid i.e. not broadcast or our local
- address.
- - figure out what "logical" interface this packet arrived
- and calculate "specific destination" address.
- - check, that packet arrived from expected physical interface.
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
*/
-
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
struct net_device *dev, __be32 *spec_dst,
u32 *itag, u32 mark)
{
struct in_device *in_dev;
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = src,
- .saddr = dst,
- .tos = tos } },
- .mark = mark,
- .iif = oif };
-
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = src,
+ .saddr = dst,
+ .tos = tos
+ }
+ },
+ .mark = mark,
+ .iif = oif
+ };
struct fib_result res;
int no_addr, rpf, accept_local;
+ bool dev_match;
int ret;
struct net *net;
no_addr = rpf = accept_local = 0;
- rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
no_addr = in_dev->ifa_list == NULL;
@@ -259,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
if (mark && !IN_DEV_SRC_VMARK(in_dev))
fl.mark = 0;
}
- rcu_read_unlock();
if (in_dev == NULL)
goto e_inval;
@@ -269,21 +281,29 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
goto last_resort;
if (res.type != RTN_UNICAST) {
if (res.type != RTN_LOCAL || !accept_local)
- goto e_inval_res;
+ goto e_inval;
}
*spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
+ dev_match = false;
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
#else
if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
#endif
- {
+ if (dev_match) {
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- fib_res_put(&res);
return ret;
}
- fib_res_put(&res);
if (no_addr)
goto last_resort;
if (rpf == 1)
@@ -296,7 +316,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
*spec_dst = FIB_RES_PREFSRC(res);
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
}
- fib_res_put(&res);
}
return ret;
@@ -307,8 +326,6 @@ last_resort:
*itag = 0;
return 0;
-e_inval_res:
- fib_res_put(&res);
e_inval:
return -EINVAL;
e_rpf:
@@ -461,9 +478,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
}
/*
- * Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
*/
-
int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct fib_config cfg;
@@ -507,7 +524,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
return -EINVAL;
}
-const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -521,7 +538,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_config *cfg)
+ struct nlmsghdr *nlh, struct fib_config *cfg)
{
struct nlattr *attr;
int err, remaining;
@@ -676,12 +693,11 @@ out:
}
/* Prepare and feed intra-kernel routing request.
- Really, it should be netlink message, but :-( netlink
- can be not configured, so that we feed it directly
- to fib engine. It is legal, because all events occur
- only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
*/
-
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -727,9 +743,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
struct in_ifaddr *prim = ifa;
__be32 mask = ifa->ifa_mask;
__be32 addr = ifa->ifa_local;
- __be32 prefix = ifa->ifa_address&mask;
+ __be32 prefix = ifa->ifa_address & mask;
- if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
if (prim == NULL) {
printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -739,22 +755,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+ 32, prim);
}
}
}
@@ -765,17 +783,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
struct net_device *dev = in_dev->dev;
struct in_ifaddr *ifa1;
struct in_ifaddr *prim = ifa;
- __be32 brd = ifa->ifa_address|~ifa->ifa_mask;
- __be32 any = ifa->ifa_address&ifa->ifa_mask;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
#define LOCAL_OK 1
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
unsigned ok = 0;
- if (!(ifa->ifa_flags&IFA_F_SECONDARY))
- fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
else {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (prim == NULL) {
@@ -785,9 +804,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
}
/* Deletion is more complicated than add.
- We should take care of not to delete too much :-)
-
- Scan address list to be sure that addresses are really gone.
+ * We should take care of not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
*/
for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -801,23 +820,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
ok |= BRD0_OK;
}
- if (!(ok&BRD_OK))
+ if (!(ok & BRD_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!(ok&BRD1_OK))
+ if (!(ok & BRD1_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
- if (!(ok&BRD0_OK))
+ if (!(ok & BRD0_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
- if (!(ok&LOCAL_OK)) {
+ if (!(ok & LOCAL_OK)) {
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
/* And the last, but not the least thing.
- We must flush stray FIB entries.
-
- First of all, we scan fib_info list searching
- for stray nexthop entries, then ignite fib_flush.
- */
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
fib_flush(dev_net(dev));
}
@@ -828,14 +847,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
{
struct fib_result res;
- struct flowi fl = { .mark = frn->fl_mark,
- .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
- .tos = frn->fl_tos,
- .scope = frn->fl_scope } } };
+ struct flowi fl = {
+ .mark = frn->fl_mark,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = frn->fl_addr,
+ .tos = frn->fl_tos,
+ .scope = frn->fl_scope
+ }
+ }
+ };
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
@@ -846,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
local_bh_disable();
frn->tb_id = tb->tb_id;
- frn->err = fib_table_lookup(tb, &fl, &res);
+ rcu_read_lock();
+ frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
frn->nh_sel = res.nh_sel;
frn->type = res.type;
frn->scope = res.scope;
- fib_res_put(&res);
}
+ rcu_read_unlock();
local_bh_enable();
}
}
@@ -883,8 +909,8 @@ static void nl_fib_input(struct sk_buff *skb)
nl_fib_lookup(frn, tb);
- pid = NETLINK_CB(skb).pid; /* pid of sending process */
- NETLINK_CB(skb).pid = 0; /* from kernel */
+ pid = NETLINK_CB(skb).pid; /* pid of sending process */
+ NETLINK_CB(skb).pid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
}
@@ -931,7 +957,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
fib_del_ifaddr(ifa);
if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
- Disable IP.
+ * Disable IP.
*/
fib_disable_ip(dev, 1, 0);
} else {
@@ -990,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = {
static int __net_init ip_fib_net_init(struct net *net)
{
int err;
- unsigned int i;
+ size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
- net->ipv4.fib_table_hash = kzalloc(
- sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
if (net->ipv4.fib_table_hash == NULL)
return -ENOMEM;
- for (i = 0; i < FIB_TABLE_HASHSZ; i++)
- INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
-
err = fib4_rules_init(net);
if (err < 0)
goto fail;
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1b..43e1c594ce8 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,36 +54,37 @@ struct fib_node {
struct fib_alias fn_embedded_alias;
};
-struct fn_zone {
- struct fn_zone *fz_next; /* Next not empty zone */
- struct hlist_head *fz_hash; /* Hash table pointer */
- int fz_nent; /* Number of entries */
+#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
- int fz_divisor; /* Hash divisor */
+struct fn_zone {
+ struct fn_zone __rcu *fz_next; /* Next not empty zone */
+ struct hlist_head __rcu *fz_hash; /* Hash table pointer */
+ seqlock_t fz_lock;
u32 fz_hashmask; /* (fz_divisor - 1) */
-#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
- int fz_order; /* Zone order */
- __be32 fz_mask;
+ u8 fz_order; /* Zone order (0..32) */
+ u8 fz_revorder; /* 32 - fz_order */
+ __be32 fz_mask; /* inet_make_mask(order) */
#define FZ_MASK(fz) ((fz)->fz_mask)
-};
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
+ struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
+
+ int fz_nent; /* Number of entries */
+ int fz_divisor; /* Hash size (mask+1) */
+};
struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone *fn_zone_list;
+ struct fn_zone *fn_zones[33];
+ struct fn_zone __rcu *fn_zone_list;
};
static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
{
- u32 h = ntohl(key)>>(32 - fz->fz_order);
+ u32 h = ntohl(key) >> fz->fz_revorder;
h ^= (h>>20);
h ^= (h>>10);
h ^= (h>>5);
- h &= FZ_HASHMASK(fz);
+ h &= fz->fz_hashmask;
return h;
}
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
return dst & FZ_MASK(fz);
}
-static DEFINE_RWLOCK(fib_hash_lock);
static unsigned int fib_hash_genid;
#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
{
unsigned long size = divisor * sizeof(struct hlist_head);
- if (size <= PAGE_SIZE) {
+ if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
- } else {
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
- }
+
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
}
/* The fib hash lock must be held when this is called. */
@@ -121,12 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
struct fib_node *f;
hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
+ struct hlist_head __rcu *new_head;
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
- hlist_add_head(&f->fn_hash, new_head);
+ hlist_add_head_rcu(&f->fn_hash, new_head);
}
}
}
@@ -147,14 +146,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
int old_divisor, new_divisor;
u32 new_hashmask;
- old_divisor = fz->fz_divisor;
+ new_divisor = old_divisor = fz->fz_divisor;
switch (old_divisor) {
- case 16:
- new_divisor = 256;
+ case EMBEDDED_HASH_SIZE:
+ new_divisor *= EMBEDDED_HASH_SIZE;
break;
- case 256:
- new_divisor = 1024;
+ case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
+ new_divisor *= (EMBEDDED_HASH_SIZE/2);
break;
default:
if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -175,31 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
ht = fz_hash_alloc(new_divisor);
if (ht) {
- write_lock_bh(&fib_hash_lock);
+ struct fn_zone nfz;
+
+ memcpy(&nfz, fz, sizeof(nfz));
+
+ write_seqlock_bh(&fz->fz_lock);
old_ht = fz->fz_hash;
- fz->fz_hash = ht;
+ nfz.fz_hash = ht;
+ nfz.fz_hashmask = new_hashmask;
+ nfz.fz_divisor = new_divisor;
+ fn_rebuild_zone(&nfz, old_ht, old_divisor);
+ fib_hash_genid++;
+ rcu_assign_pointer(fz->fz_hash, ht);
fz->fz_hashmask = new_hashmask;
fz->fz_divisor = new_divisor;
- fn_rebuild_zone(fz, old_ht, old_divisor);
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
+ write_sequnlock_bh(&fz->fz_lock);
- fz_hash_free(old_ht, old_divisor);
+ if (old_ht != fz->fz_embedded_hash) {
+ synchronize_rcu();
+ fz_hash_free(old_ht, old_divisor);
+ }
}
}
-static inline void fn_free_node(struct fib_node * f)
+static void fn_free_node_rcu(struct rcu_head *head)
{
+ struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
+
kmem_cache_free(fn_hash_kmem, f);
}
+static inline void fn_free_node(struct fib_node *f)
+{
+ call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
+}
+
+static void fn_free_alias_rcu(struct rcu_head *head)
+{
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
{
fib_release_info(fa->fa_info);
if (fa == &f->fn_embedded_alias)
fa->fa_info = NULL;
else
- kmem_cache_free(fn_alias_kmem, fa);
+ call_rcu(&fa->rcu, fn_free_alias_rcu);
}
static struct fn_zone *
@@ -210,68 +233,71 @@ fn_new_zone(struct fn_hash *table, int z)
if (!fz)
return NULL;
- if (z) {
- fz->fz_divisor = 16;
- } else {
- fz->fz_divisor = 1;
- }
- fz->fz_hashmask = (fz->fz_divisor - 1);
- fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
- if (!fz->fz_hash) {
- kfree(fz);
- return NULL;
- }
+ seqlock_init(&fz->fz_lock);
+ fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
+ fz->fz_hashmask = fz->fz_divisor - 1;
+ fz->fz_hash = fz->fz_embedded_hash;
fz->fz_order = z;
+ fz->fz_revorder = 32 - z;
fz->fz_mask = inet_make_mask(z);
/* Find the first not empty zone with more specific mask */
- for (i=z+1; i<=32; i++)
+ for (i = z + 1; i <= 32; i++)
if (table->fn_zones[i])
break;
- write_lock_bh(&fib_hash_lock);
- if (i>32) {
+ if (i > 32) {
/* No more specific masks, we are the first. */
- fz->fz_next = table->fn_zone_list;
- table->fn_zone_list = fz;
+ rcu_assign_pointer(fz->fz_next,
+ rtnl_dereference(table->fn_zone_list));
+ rcu_assign_pointer(table->fn_zone_list, fz);
} else {
- fz->fz_next = table->fn_zones[i]->fz_next;
- table->fn_zones[i]->fz_next = fz;
+ rcu_assign_pointer(fz->fz_next,
+ rtnl_dereference(table->fn_zones[i]->fz_next));
+ rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
}
table->fn_zones[z] = fz;
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
return fz;
}
int fib_table_lookup(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res)
+ const struct flowi *flp, struct fib_result *res,
+ int fib_flags)
{
int err;
struct fn_zone *fz;
struct fn_hash *t = (struct fn_hash *)tb->tb_data;
- read_lock(&fib_hash_lock);
- for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
- struct hlist_head *head;
+ rcu_read_lock();
+ for (fz = rcu_dereference(t->fn_zone_list);
+ fz != NULL;
+ fz = rcu_dereference(fz->fz_next)) {
+ struct hlist_head __rcu *head;
struct hlist_node *node;
struct fib_node *f;
- __be32 k = fz_key(flp->fl4_dst, fz);
+ __be32 k;
+ unsigned int seq;
- head = &fz->fz_hash[fn_hash(k, fz)];
- hlist_for_each_entry(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
+ do {
+ seq = read_seqbegin(&fz->fz_lock);
+ k = fz_key(flp->fl4_dst, fz);
+
+ head = &fz->fz_hash[fn_hash(k, fz)];
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
+ if (f->fn_key != k)
+ continue;
- err = fib_semantic_match(&f->fn_alias,
+ err = fib_semantic_match(&f->fn_alias,
flp, res,
- fz->fz_order);
- if (err <= 0)
- goto out;
- }
+ fz->fz_order, fib_flags);
+ if (err <= 0)
+ goto out;
+ }
+ } while (read_seqretry(&fz->fz_lock, seq));
}
err = 1;
out:
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
return err;
}
@@ -293,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb,
last_resort = NULL;
order = -1;
- read_lock(&fib_hash_lock);
- hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) {
struct fib_alias *fa;
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
struct fib_info *next_fi = fa->fa_info;
if (fa->fa_scope != res->scope ||
@@ -309,7 +335,8 @@ void fib_table_select_default(struct fib_table *tb,
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+
+ fib_alias_accessed(fa);
if (fi == NULL) {
if (next_fi != res->fi)
@@ -341,7 +368,7 @@ void fib_table_select_default(struct fib_table *tb,
fib_result_assign(res, last_resort);
tb->tb_default = last_idx;
out:
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
}
/* Insert node F to FZ. */
@@ -349,7 +376,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
{
struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
- hlist_add_head(&f->fn_hash, head);
+ hlist_add_head_rcu(&f->fn_hash, head);
}
/* Return the node in FZ matching KEY. */
@@ -359,7 +386,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
struct hlist_node *node;
struct fib_node *f;
- hlist_for_each_entry(f, node, head, fn_hash) {
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
if (f->fn_key == key)
return f;
}
@@ -367,6 +394,17 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
return NULL;
}
+
+static struct fib_alias *fib_fast_alloc(struct fib_node *f)
+{
+ struct fib_alias *fa = &f->fn_embedded_alias;
+
+ if (fa->fa_info != NULL)
+ fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ return fa;
+}
+
+/* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
@@ -451,7 +489,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
}
if (cfg->fc_nlflags & NLM_F_REPLACE) {
- struct fib_info *fi_drop;
u8 state;
fa = fa_first;
@@ -460,21 +497,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
err = 0;
goto out;
}
- write_lock_bh(&fib_hash_lock);
- fi_drop = fa->fa_info;
- fa->fa_info = fi;
- fa->fa_type = cfg->fc_type;
- fa->fa_scope = cfg->fc_scope;
+ err = -ENOBUFS;
+ new_fa = fib_fast_alloc(f);
+ if (new_fa == NULL)
+ goto out;
+
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = cfg->fc_type;
+ new_fa->fa_scope = cfg->fc_scope;
state = fa->fa_state;
- fa->fa_state &= ~FA_S_ACCESSED;
+ new_fa->fa_state = state & ~FA_S_ACCESSED;
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
+ list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
- fib_release_info(fi_drop);
+ fn_free_alias(fa, f);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, NLM_F_REPLACE);
+ rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
+ tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
return 0;
}
@@ -506,12 +547,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
f = new_f;
}
- new_fa = &f->fn_embedded_alias;
- if (new_fa->fa_info != NULL) {
- new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
- goto out;
- }
+ new_fa = fib_fast_alloc(f);
+ if (new_fa == NULL)
+ goto out;
+
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
@@ -522,13 +561,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
* Insert new entry to the list.
*/
- write_lock_bh(&fib_hash_lock);
if (new_f)
fib_insert_node(fz, new_f);
- list_add_tail(&new_fa->fa_list,
+ list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : &f->fn_alias));
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
if (new_f)
fz->fz_nent++;
@@ -603,14 +640,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
tb->tb_id, &cfg->fc_nlinfo, 0);
kill_fn = 0;
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
kill_fn = 1;
}
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
if (fa->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -641,14 +676,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
struct fib_info *fi = fa->fa_info;
if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
kill_f = 1;
}
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
fn_free_alias(fa, f);
found++;
@@ -662,13 +695,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
return found;
}
+/* caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
struct fn_zone *fz;
int found = 0;
- for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+ for (fz = rtnl_dereference(table->fn_zone_list);
+ fz != NULL;
+ fz = rtnl_dereference(fz->fz_next)) {
int i;
for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -690,10 +726,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
s_i = cb->args[4];
i = 0;
- hlist_for_each_entry(f, node, head, fn_hash) {
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
struct fib_alias *fa;
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
if (i < s_i)
goto next;
@@ -711,7 +747,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
cb->args[4] = i;
return -1;
}
- next:
+next:
i++;
}
}
@@ -746,23 +782,26 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- int m, s_m;
+ int m = 0, s_m;
struct fn_zone *fz;
struct fn_hash *table = (struct fn_hash *)tb->tb_data;
s_m = cb->args[2];
- read_lock(&fib_hash_lock);
- for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
- if (m < s_m) continue;
+ rcu_read_lock();
+ for (fz = rcu_dereference(table->fn_zone_list);
+ fz != NULL;
+ fz = rcu_dereference(fz->fz_next), m++) {
+ if (m < s_m)
+ continue;
if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
cb->args[2] = m;
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
return -1;
}
memset(&cb->args[3], 0,
sizeof(cb->args) - 3*sizeof(cb->args[0]));
}
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
cb->args[2] = m;
return skb->len;
}
@@ -825,8 +864,9 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
iter->genid = fib_hash_genid;
iter->valid = 1;
- for (iter->zone = table->fn_zone_list; iter->zone;
- iter->zone = iter->zone->fz_next) {
+ for (iter->zone = rcu_dereference(table->fn_zone_list);
+ iter->zone != NULL;
+ iter->zone = rcu_dereference(iter->zone->fz_next)) {
int maxslot;
if (!iter->zone->fz_nent)
@@ -911,7 +951,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
}
}
- iter->zone = iter->zone->fz_next;
+ iter->zone = rcu_dereference(iter->zone->fz_next);
if (!iter->zone)
goto out;
@@ -950,11 +990,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
}
static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(fib_hash_lock)
+ __acquires(RCU)
{
void *v = NULL;
- read_lock(&fib_hash_lock);
+ rcu_read_lock();
if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
return v;
@@ -967,15 +1007,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void fib_seq_stop(struct seq_file *seq, void *v)
- __releases(fib_hash_lock)
+ __releases(RCU)
{
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
}
static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
+ [7] = RTF_REJECT,
+ [8] = RTF_REJECT,
};
unsigned flags = type2flags[type];
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973b..a29edf2219c 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,17 +12,22 @@ struct fib_alias {
u8 fa_type;
u8 fa_scope;
u8 fa_state;
-#ifdef CONFIG_IP_FIB_TRIE
struct rcu_head rcu;
-#endif
};
#define FA_S_ACCESSED 0x01
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+ if (!(fa->fa_state & FA_S_ACCESSED))
+ fa->fa_state |= FA_S_ACCESSED;
+}
+
/* Exported by fib_semantics.c */
extern int fib_semantic_match(struct list_head *head,
const struct flowi *flp,
- struct fib_result *res, int prefixlen);
+ struct fib_result *res, int prefixlen, int fib_flags);
extern void fib_release_info(struct fib_info *);
extern struct fib_info *fib_create_info(struct fib_config *cfg);
extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff56..7981a24f5c7 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
* IPv4 Forwarding Information Base: policy rules.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * Thomas Graf <tgraf@suug.ch>
+ * Thomas Graf <tgraf@suug.ch>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
* 2 of the License, or (at your option) any later version.
*
* Fixes:
- * Rani Assaf : local_rule cannot be deleted
+ * Rani Assaf : local_rule cannot be deleted
* Marc Boucher : routing by fwmark
*/
@@ -32,8 +32,7 @@
#include <net/ip_fib.h>
#include <net/fib_rules.h>
-struct fib4_rule
-{
+struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
@@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
+ .flags = FIB_LOOKUP_NOREF,
};
int err;
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
goto errout;
}
- if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
+ tbl = fib_get_table(rule->fr_net, rule->table);
+ if (!tbl)
goto errout;
- err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
+ err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
if (err > 0)
err = -EAGAIN;
errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e..3e0da3ef611 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
static DEFINE_SPINLOCK(fib_multipath_lock);
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
-for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh; \
+ for (nhsel = 0, nh = (fi)->fib_nh; \
+ nhsel < (fi)->fib_nhs; \
+ nh++, nhsel++)
+
+#define change_nexthops(fi) { \
+ int nhsel; struct fib_nh *nexthop_nh; \
+ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ nhsel < (fi)->fib_nhs; \
+ nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
/* Hope, that gcc will optimize it to get rid of dummy loop */
-#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
+ for (nhsel = 0; nhsel < 1; nhsel++)
-#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) { \
+ int nhsel; \
+ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ for (nhsel = 0; nhsel < 1; nhsel++)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -86,63 +95,70 @@ static const struct
int error;
u8 scope;
} fib_props[RTN_MAX + 1] = {
- {
+ [RTN_UNSPEC] = {
.error = 0,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_UNSPEC */
- {
+ },
+ [RTN_UNICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNICAST */
- {
+ },
+ [RTN_LOCAL] = {
.error = 0,
.scope = RT_SCOPE_HOST,
- }, /* RTN_LOCAL */
- {
+ },
+ [RTN_BROADCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_BROADCAST */
- {
+ },
+ [RTN_ANYCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_ANYCAST */
- {
+ },
+ [RTN_MULTICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_MULTICAST */
- {
+ },
+ [RTN_BLACKHOLE] = {
.error = -EINVAL,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_BLACKHOLE */
- {
+ },
+ [RTN_UNREACHABLE] = {
.error = -EHOSTUNREACH,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNREACHABLE */
- {
+ },
+ [RTN_PROHIBIT] = {
.error = -EACCES,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_PROHIBIT */
- {
+ },
+ [RTN_THROW] = {
.error = -EAGAIN,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_THROW */
- {
+ },
+ [RTN_NAT] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_NAT */
- {
+ },
+ [RTN_XRESOLVE] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_XRESOLVE */
+ },
};
/* Release a nexthop info record */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+ struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+ kfree(fi);
+}
+
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
+ pr_warning("Freeing alive fib_info %p\n", fi);
return;
}
change_nexthops(fi) {
@@ -152,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
} endfor_nexthops(fi);
fib_info_cnt--;
release_net(fi->fib_net);
- kfree(fi);
+ call_rcu(&fi->rcu, free_fib_info_rcu);
}
void fib_release_info(struct fib_info *fi)
@@ -173,7 +189,7 @@ void fib_release_info(struct fib_info *fi)
spin_unlock_bh(&fib_info_lock);
}
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
const struct fib_nh *onh = ofi->fib_nh;
@@ -187,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
#ifdef CONFIG_NET_CLS_ROUTE
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
return -1;
onh++;
} endfor_nexthops(fi);
@@ -238,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
nfi->fib_priority == fi->fib_priority &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(fi->fib_metrics)) == 0 &&
- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
}
@@ -247,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
}
/* Check, that the gateway is already configured.
- Used only by redirect accept routine.
+ * Used only by redirect accept routine.
*/
-
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
@@ -264,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
hlist_for_each_entry(nh, node, head, nh_hash) {
if (nh->nh_dev == dev &&
nh->nh_gw == gw &&
- !(nh->nh_flags&RTNH_F_DEAD)) {
+ !(nh->nh_flags & RTNH_F_DEAD)) {
spin_unlock(&fib_info_lock);
return 0;
}
@@ -362,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order,
}
if (state == NUD_REACHABLE)
return 0;
- if ((state&NUD_VALID) && order != dflt)
+ if ((state & NUD_VALID) && order != dflt)
return 0;
- if ((state&NUD_VALID) ||
- (*last_idx<0 && order > dflt)) {
+ if ((state & NUD_VALID) ||
+ (*last_idx < 0 && order > dflt)) {
*last_resort = fi;
*last_idx = order;
}
@@ -476,75 +491,76 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
/*
- Picture
- -------
-
- Semantics of nexthop is very messy by historical reasons.
- We have to take into account, that:
- a) gateway can be actually local interface address,
- so that gatewayed route is direct.
- b) gateway must be on-link address, possibly
- described not by an ifaddr, but also by a direct route.
- c) If both gateway and interface are specified, they should not
- contradict.
- d) If we use tunnel routes, gateway could be not on-link.
-
- Attempt to reconcile all of these (alas, self-contradictory) conditions
- results in pretty ugly and hairy code with obscure logic.
-
- I chose to generalized it instead, so that the size
- of code does not increase practically, but it becomes
- much more general.
- Every prefix is assigned a "scope" value: "host" is local address,
- "link" is direct route,
- [ ... "site" ... "interior" ... ]
- and "universe" is true gateway route with global meaning.
-
- Every prefix refers to a set of "nexthop"s (gw, oif),
- where gw must have narrower scope. This recursion stops
- when gw has LOCAL scope or if "nexthop" is declared ONLINK,
- which means that gw is forced to be on link.
-
- Code is still hairy, but now it is apparently logically
- consistent and very flexible. F.e. as by-product it allows
- to co-exists in peace independent exterior and interior
- routing processes.
-
- Normally it looks as following.
-
- {universe prefix} -> (gw, oif) [scope link]
- |
- |-> {link prefix} -> (gw, oif) [scope local]
- |
- |-> {local prefix} (terminal node)
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ * so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ * described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ * contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix} -> (gw, oif) [scope link]
+ * |
+ * |-> {link prefix} -> (gw, oif) [scope local]
+ * |
+ * |-> {local prefix} (terminal node)
*/
-
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh)
{
int err;
struct net *net;
+ struct net_device *dev;
net = cfg->fc_nlinfo.nl_net;
if (nh->nh_gw) {
struct fib_result res;
- if (nh->nh_flags&RTNH_F_ONLINK) {
- struct net_device *dev;
+ if (nh->nh_flags & RTNH_F_ONLINK) {
if (cfg->fc_scope >= RT_SCOPE_LINK)
return -EINVAL;
if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
return -EINVAL;
- if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
+ dev = __dev_get_by_index(net, nh->nh_oif);
+ if (!dev)
return -ENODEV;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return -ENETDOWN;
nh->nh_dev = dev;
dev_hold(dev);
nh->nh_scope = RT_SCOPE_LINK;
return 0;
}
+ rcu_read_lock();
{
struct flowi fl = {
.nl_u = {
@@ -559,50 +575,53 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
/* It is not necessary, but requires a bit of thinking */
if (fl.fl4_scope < RT_SCOPE_LINK)
fl.fl4_scope = RT_SCOPE_LINK;
- if ((err = fib_lookup(net, &fl, &res)) != 0)
+ err = fib_lookup(net, &fl, &res);
+ if (err) {
+ rcu_read_unlock();
return err;
+ }
}
err = -EINVAL;
if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
goto out;
nh->nh_scope = res.scope;
nh->nh_oif = FIB_RES_OIF(res);
- if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+ nh->nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev)
goto out;
- dev_hold(nh->nh_dev);
- err = -ENETDOWN;
- if (!(nh->nh_dev->flags & IFF_UP))
- goto out;
- err = 0;
-out:
- fib_res_put(&res);
- return err;
+ dev_hold(dev);
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
} else {
struct in_device *in_dev;
- if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
return -EINVAL;
+ rcu_read_lock();
+ err = -ENODEV;
in_dev = inetdev_by_index(net, nh->nh_oif);
if (in_dev == NULL)
- return -ENODEV;
- if (!(in_dev->dev->flags&IFF_UP)) {
- in_dev_put(in_dev);
- return -ENETDOWN;
- }
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP))
+ goto out;
nh->nh_dev = in_dev->dev;
dev_hold(nh->nh_dev);
nh->nh_scope = RT_SCOPE_HOST;
- in_dev_put(in_dev);
+ err = 0;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return err;
}
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
unsigned int mask = (fib_hash_size - 1);
- return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+ return ((__force u32)val ^
+ ((__force u32)val >> 7) ^
+ ((__force u32)val >> 14)) & mask;
}
static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +630,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
return kzalloc(bytes, GFP_KERNEL);
else
return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(bytes));
}
static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +826,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} else {
change_nexthops(fi) {
- if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
+ err = fib_check_nh(cfg, fi, nexthop_nh);
+ if (err != 0)
goto failure;
} endfor_nexthops(fi)
}
@@ -819,7 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
}
link_it:
- if ((ofi = fib_find_info(fi)) != NULL) {
+ ofi = fib_find_info(fi);
+ if (ofi) {
fi->fib_dead = 1;
free_fib_info(fi);
ofi->fib_treeref++;
@@ -864,7 +886,7 @@ failure:
/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, int prefixlen)
+ struct fib_result *res, int prefixlen, int fib_flags)
{
struct fib_alias *fa;
int nh_sel = 0;
@@ -879,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
if (fa->fa_scope < flp->fl4_scope)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+ fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
if (err == 0) {
@@ -895,7 +917,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
case RTN_ANYCAST:
case RTN_MULTICAST:
for_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
+ if (nh->nh_flags & RTNH_F_DEAD)
continue;
if (!flp->oif || flp->oif == nh->nh_oif)
break;
@@ -906,16 +928,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
goto out_fill_res;
}
#else
- if (nhsel < 1) {
+ if (nhsel < 1)
goto out_fill_res;
- }
#endif
endfor_nexthops(fi);
continue;
default:
- printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
- fa->fa_type);
+ pr_warning("fib_semantic_match bad type %#x\n",
+ fa->fa_type);
return -EINVAL;
}
}
@@ -929,7 +950,8 @@ out_fill_res:
res->type = fa->fa_type;
res->scope = fa->fa_scope;
res->fi = fa->fa_info;
- atomic_inc(&res->fi->fib_clntref);
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&res->fi->fib_clntref);
return 0;
}
@@ -1028,10 +1050,10 @@ nla_put_failure:
}
/*
- Update FIB if:
- - local address disappeared -> we must delete all the entries
- referring to it.
- - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ * referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
*/
int fib_sync_down_addr(struct net *net, __be32 local)
{
@@ -1078,7 +1100,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
prev_fi = fi;
dead = 0;
change_nexthops(fi) {
- if (nexthop_nh->nh_flags&RTNH_F_DEAD)
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD)
dead++;
else if (nexthop_nh->nh_dev == dev &&
nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1132,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
- Dead device goes up. We wake up dead nexthops.
- It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
*/
-
int fib_sync_up(struct net_device *dev)
{
struct fib_info *prev_fi;
@@ -1123,7 +1144,7 @@ int fib_sync_up(struct net_device *dev)
struct fib_nh *nh;
int ret;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return 0;
prev_fi = NULL;
@@ -1142,12 +1163,12 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
alive++;
continue;
}
if (nexthop_nh->nh_dev == NULL ||
- !(nexthop_nh->nh_dev->flags&IFF_UP))
+ !(nexthop_nh->nh_dev->flags & IFF_UP))
continue;
if (nexthop_nh->nh_dev != dev ||
!__in_dev_get_rtnl(dev))
@@ -1169,10 +1190,9 @@ int fib_sync_up(struct net_device *dev)
}
/*
- The algorithm is suboptimal, but it provides really
- fair weighted route distribution.
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
*/
-
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
struct fib_info *fi = res->fi;
@@ -1182,7 +1202,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
if (fi->fib_power <= 0) {
int power = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
power += nexthop_nh->nh_weight;
nexthop_nh->nh_power = nexthop_nh->nh_weight;
}
@@ -1198,15 +1218,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
/* w should be random number [0..fi->fib_power-1],
- it is pretty bad approximation.
+ * it is pretty bad approximation.
*/
w = jiffies % fi->fib_power;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
nexthop_nh->nh_power) {
- if ((w -= nexthop_nh->nh_power) <= 0) {
+ w -= nexthop_nh->nh_power;
+ if (w <= 0) {
nexthop_nh->nh_power--;
fi->fib_power--;
res->nh_sel = nhsel;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2230ae3bf20..b1445089510 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,7 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
{
struct tnode *ret = node_parent(node);
- return rcu_dereference(ret);
+ return rcu_dereference_rtnl(ret);
}
/* Same as rcu_assign_pointer
@@ -209,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
struct node *ret = tnode_get_child(tn, i);
- return rcu_dereference_check(ret,
- rcu_read_lock_held() ||
- lockdep_rtnl_is_held());
+ return rcu_dereference_rtnl(ret);
}
static inline int tnode_child_length(const struct tnode *tn)
@@ -457,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
tn->empty_children = 1<<bits;
}
- pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
- (unsigned long) (sizeof(struct node) << bits));
+ pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
+ sizeof(struct node) << bits);
return tn;
}
@@ -607,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Keep root node larger */
- if (!node_parent((struct node*) tn)) {
+ if (!node_parent((struct node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
- }
- else {
+ } else {
inflate_threshold_use = inflate_threshold;
halve_threshold_use = halve_threshold;
}
@@ -637,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
check_tnode(tn);
/* Return if at least one inflate is run */
- if( max_work != MAX_WORK)
+ if (max_work != MAX_WORK)
return (struct node *) tn;
/*
@@ -964,9 +961,7 @@ fib_find_node(struct trie *t, u32 key)
struct node *n;
pos = 0;
- n = rcu_dereference_check(t->trie,
- rcu_read_lock_held() ||
- lockdep_rtnl_is_held());
+ n = rcu_dereference_rtnl(t->trie);
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
@@ -1347,7 +1342,7 @@ err:
/* should be called with rcu_read_lock */
static int check_leaf(struct trie *t, struct leaf *l,
t_key key, const struct flowi *flp,
- struct fib_result *res)
+ struct fib_result *res, int fib_flags)
{
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
@@ -1361,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
if (l->key != (key & ntohl(mask)))
continue;
- err = fib_semantic_match(&li->falh, flp, res, plen);
+ err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
#ifdef CONFIG_IP_FIB_TRIE_STATS
if (err <= 0)
@@ -1377,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
}
int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
- struct fib_result *res)
+ struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
@@ -1389,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
t_key cindex = 0;
int current_prefix_length = KEYLENGTH;
struct tnode *cn;
- t_key node_prefix, key_prefix, pref_mismatch;
- int mp;
+ t_key pref_mismatch;
rcu_read_lock();
@@ -1404,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+ ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
goto found;
}
@@ -1429,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+ ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
@@ -1505,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
* matching prefix.
*/
- node_prefix = mask_pfx(cn->key, cn->pos);
- key_prefix = mask_pfx(key, cn->pos);
- pref_mismatch = key_prefix^node_prefix;
- mp = 0;
+ pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
/*
* In short: If skipped bits in this node do not match
@@ -1516,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
* state.directly.
*/
if (pref_mismatch) {
- while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
- mp++;
- pref_mismatch = pref_mismatch << 1;
- }
- key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+ int mp = KEYLENGTH - fls(pref_mismatch);
- if (key_prefix != 0)
+ if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
goto backtrace;
if (current_prefix_length >= cn->pos)
@@ -1746,14 +1733,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
/* Node empty, walk back up to parent */
c = (struct node *) p;
- } while ( (p = node_parent_rcu(c)) != NULL);
+ } while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
}
static struct leaf *trie_firstleaf(struct trie *t)
{
- struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+ struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
if (!n)
return NULL;
@@ -1851,7 +1838,8 @@ void fib_table_select_default(struct fib_table *tb,
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+
+ fib_alias_accessed(fa);
if (fi == NULL) {
if (next_fi != res->fi)
@@ -2039,14 +2027,14 @@ struct fib_trie_iter {
struct seq_net_private p;
struct fib_table *tb;
struct tnode *tnode;
- unsigned index;
- unsigned depth;
+ unsigned int index;
+ unsigned int depth;
};
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
- unsigned cindex = iter->index;
+ unsigned int cindex = iter->index;
struct tnode *p;
/* A single entry routing table */
@@ -2155,7 +2143,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
*/
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
- unsigned i, max, pointers, bytes, avdepth;
+ unsigned int i, max, pointers, bytes, avdepth;
if (stat->leaves)
avdepth = stat->totdepth*100 / stat->leaves;
@@ -2352,7 +2340,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
static void seq_indent(struct seq_file *seq, int n)
{
- while (n-- > 0) seq_puts(seq, " ");
+ while (n-- > 0)
+ seq_puts(seq, " ");
}
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2384,7 +2373,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
[RTN_XRESOLVE] = "XRESOLVE",
};
-static inline const char *rtn_type(char *buf, size_t len, unsigned t)
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
if (t < __RTN_MAX && rtn_type_names[t])
return rtn_type_names[t];
@@ -2540,13 +2529,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock();
}
-static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
{
- static unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
+ unsigned int flags = 0;
+ if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+ flags = RTF_REJECT;
if (fi && fi->fib_nh->nh_gw)
flags |= RTF_GATEWAY;
if (mask == htonl(0xFFFFFFFF))
@@ -2558,7 +2546,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
/*
* This outputs /proc/net/route.
* The format of the file is not supposed to be changed
- * and needs to be same as fib_hash output to avoid breaking
+ * and needs to be same as fib_hash output to avoid breaking
* legacy utilities
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2583,7 +2571,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
- unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
int len;
if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 00000000000..caea6885fdb
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,151 @@
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+
+static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
+static DEFINE_SPINLOCK(gre_proto_lock);
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ goto err_out;
+
+ spin_lock(&gre_proto_lock);
+ if (gre_proto[version])
+ goto err_out_unlock;
+
+ rcu_assign_pointer(gre_proto[version], proto);
+ spin_unlock(&gre_proto_lock);
+ return 0;
+
+err_out_unlock:
+ spin_unlock(&gre_proto_lock);
+err_out:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ goto err_out;
+
+ spin_lock(&gre_proto_lock);
+ if (gre_proto[version] != proto)
+ goto err_out_unlock;
+ rcu_assign_pointer(gre_proto[version], NULL);
+ spin_unlock(&gre_proto_lock);
+ synchronize_rcu();
+ return 0;
+
+err_out_unlock:
+ spin_unlock(&gre_proto_lock);
+err_out:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->err_handler)
+ goto drop_unlock;
+ proto->err_handler(skb, info);
+ rcu_read_unlock();
+ return;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .netns_ok = 1,
+};
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexor driver");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("gre: can't add protocol\n");
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void __exit gre_exit(void)
+{
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
+
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a0d847c7cba..96bc7f9475a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
inet->tos = ip_hdr(skb)->tos;
daddr = ipc.addr = rt->rt_src;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
if (icmp_param->replyopts.optlen) {
ipc.opt = &icmp_param->replyopts;
if (ipc.opt->srr)
@@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
inet_sk(sk)->tos = tos;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
{
struct flowi fl = {
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a1ad0e7180d..c8877c6c721 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
igmpv3_clear_delrec(in_dev);
} else if (len < 12) {
return; /* ignore bogus packet; freed by caller */
+ } else if (IGMP_V1_SEEN(in_dev)) {
+ /* This is a v3 query with v1 queriers present */
+ max_delay = IGMP_Query_Response_Interval;
+ group = 0;
+ } else if (IGMP_V2_SEEN(in_dev)) {
+ /* this is a v3 query with v2 queriers present;
+ * Interpretation of the max_delay code is problematic here.
+ * A real v2 host would use ih_code directly, while v3 has a
+ * different encoding. We use the v3 encoding as more likely
+ * to be intended in a v3 query.
+ */
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
} else { /* v3 */
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
return;
@@ -1257,14 +1269,14 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
- igmp_mod_timer(im, IGMP_Initial_Report_Delay);
- return;
- }
- /* else, v3 */
- im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
- igmp_ifc_event(in_dev);
+ /* a failover is happening and switches
+ * must be notified immediately */
+ if (IGMP_V1_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+ else if (IGMP_V2_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+ else
+ igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
#endif
}
EXPORT_SYMBOL(ip_mc_rejoin_group);
@@ -1406,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
write_unlock_bh(&in_dev->mc_list_lock);
}
+/* RTNL is locked */
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
struct flowi fl = { .nl_u = { .ip4_u =
@@ -1416,15 +1429,12 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
if (imr->imr_ifindex) {
idev = inetdev_by_index(net, imr->imr_ifindex);
- if (idev)
- __in_dev_put(idev);
return idev;
}
if (imr->imr_address.s_addr) {
- dev = ip_dev_find(net, imr->imr_address.s_addr);
+ dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
if (!dev)
return NULL;
- dev_put(dev);
}
if (!dev && !ip_route_output_key(net, &rt, &fl)) {
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce32..ba804266584 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
bc += op->no;
}
}
- return (len == 0);
+ return len == 0;
}
static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fb7ad5a21ff..1b344f30b46 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -101,19 +101,43 @@ void inet_put_port(struct sock *sk)
}
EXPORT_SYMBOL(inet_put_port);
-void __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(struct sock *sk, struct sock *child)
{
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
- const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
+ unsigned short port = inet_sk(child)->inet_num;
+ const int bhash = inet_bhashfn(sock_net(sk), port,
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind_bucket *tb;
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
+ if (tb->port != port) {
+ /* NOTE: using tproxy and redirecting skbs to a proxy
+ * on a different listener port breaks the assumption
+ * that the listener socket's icsk_bind_hash is the same
+ * as that of the child socket. We have to look up or
+ * create a new bind bucket for the child here. */
+ struct hlist_node *node;
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (net_eq(ib_net(tb), sock_net(sk)) &&
+ tb->port == port)
+ break;
+ }
+ if (!node) {
+ tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+ sock_net(sk), head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ return -ENOMEM;
+ }
+ }
+ }
sk_add_bind_node(child, &tb->owners);
inet_csk(child)->icsk_bind_hash = tb;
spin_unlock(&head->lock);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b7c41654dde..168440834ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
- return (qp->id == arg->iph->id &&
+ return qp->id == arg->iph->id &&
qp->saddr == arg->iph->saddr &&
qp->daddr == arg->iph->daddr &&
qp->protocol == arg->iph->protocol &&
- qp->user == arg->user);
+ qp->user == arg->user;
}
/* Memory Tracking Functions. */
@@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
- if (skb_has_frags(head)) {
+ if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 945b20a5ad5..d0ffcbe369b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,8 +44,9 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
+#include <net/gre.h>
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -63,13 +64,13 @@
We cannot track such dead loops during route installation,
it is infeasible task. The most general solutions would be
to keep skb->encapsulation counter (sort of local ttl),
- and silently drop packet when it expires. It is the best
+ and silently drop packet when it expires. It is a good
solution, but it supposes maintaing new variable in ALL
skb, even if no tunneling is used.
- Current solution: HARD_TX_LOCK lock breaks dead loops.
-
-
+ Current solution: xmit_recursion breaks dead loops. This is a percpu
+ counter, since when we enter the first ndo_xmit(), cpu migration is
+ forbidden. We force an exit if this counter reaches RECURSION_LIMIT
2. Networking dead loops would not kill routers, but would really
kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
static int ipgre_net_id __read_mostly;
struct ipgre_net {
- struct ip_tunnel *tunnels[4][HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
struct net_device *fb_tunnel_dev;
};
@@ -158,13 +159,40 @@ struct ipgre_net {
#define tunnels_l tunnels[1]
#define tunnels_wc tunnels[0]
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ipgre_lock);
#define for_each_ip_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
/* Given src, dst and key, find appropriate for input tunnel. */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
{
struct net *net = dev_net(dev);
int link = dev->ifindex;
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(key);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(key);
struct ip_tunnel *t, *cand = NULL;
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
return NULL;
}
-static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
+static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
struct ip_tunnel_parm *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
- unsigned h = HASH(key);
+ unsigned int h = HASH(key);
int prio = 0;
if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
return &ign->tunnels[prio][h];
}
-static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
+static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
struct ip_tunnel *t)
{
return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipgre_bucket(ign, t);
+ struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
- spin_lock_bh(&ipgre_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ipgre_lock);
}
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
-
- for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ipgre_lock);
- *tp = t->next;
- spin_unlock_bh(&ipgre_lock);
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipgre_bucket(ign, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
int link = parms->link;
- struct ip_tunnel *t, **tp;
+ struct ip_tunnel *t;
+ struct ip_tunnel __rcu **tp;
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
+ for (tp = __ipgre_bucket(ign, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next)
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
return t;
}
-static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
struct ip_tunnel_parm *parms, int create)
{
struct ip_tunnel *t, *nt;
@@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
if ((tunnel = ipgre_tunnel_lookup(skb->dev,
iph->saddr, iph->daddr, key,
gre_proto))) {
- struct net_device_stats *stats = &tunnel->dev->stats;
+ struct pcpu_tstats *tstats;
secpath_reset(skb);
@@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
/* Looped back packet, drop it! */
if (skb_rtable(skb)->fl.iif == 0)
goto drop;
- stats->multicast++;
+ tunnel->dev->stats.multicast++;
skb->pkt_type = PACKET_BROADCAST;
}
#endif
if (((flags&GRE_CSUM) && csum) ||
(!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- stats->rx_crc_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
if (tunnel->parms.i_flags&GRE_SEQ) {
if (!(flags&GRE_SEQ) ||
(tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
- stats->rx_fifo_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
tunnel->i_seqno = seqno + 1;
@@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
if (!pskb_may_pull(skb, ETH_HLEN)) {
- stats->rx_length_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_length_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
@@ -640,14 +670,19 @@ static int ipgre_rcv(struct sk_buff *skb)
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
- skb_tunnel_rx(skb, tunnel->dev);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, tunnel->dev);
skb_reset_network_header(skb);
ipgre_ecn_decapsulate(iph, skb);
netif_rx(skb);
+
rcu_read_unlock();
- return(0);
+ return 0;
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -655,20 +690,19 @@ drop:
rcu_read_unlock();
drop_nolock:
kfree_skb(skb);
- return(0);
+ return 0;
}
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct pcpu_tstats *tstats;
struct iphdr *old_iph = ip_hdr(skb);
struct iphdr *tiph;
u8 tos;
__be16 df;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int gre_hlen;
@@ -690,7 +724,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
/* NBMA tunnel */
if (skb_dst(skb) == NULL) {
- stats->tx_fifo_errors++;
+ dev->stats.tx_fifo_errors++;
goto tx_error;
}
@@ -699,7 +733,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if ((dst = rt->rt_gateway) == 0)
goto tx_error_icmp;
}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
struct in6_addr *addr6;
int addr_type;
@@ -736,14 +770,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
{
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ }
+;
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error;
}
}
@@ -751,7 +791,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if (tdev == dev) {
ip_rt_put(rt);
- stats->collisions++;
+ dev->stats.collisions++;
goto tx_error;
}
@@ -774,7 +814,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
goto tx_error;
}
}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
@@ -814,7 +854,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
dev->needed_headroom = max_headroom;
if (!new_skb) {
ip_rt_put(rt);
- txq->tx_dropped++;
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -850,7 +890,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if ((iph->ttl = tiph->ttl) == 0) {
if (skb->protocol == htons(ETH_P_IP))
iph->ttl = old_iph->ttl;
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6))
iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
@@ -881,15 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
nf_reset(skb);
-
- IPTUNNEL_XMIT();
+ tstats = this_cpu_ptr(dev->tstats);
+ __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- stats->tx_errors++;
+ dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -909,13 +949,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
/* Guess output device to choose reasonable mtu and needed_headroom */
if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ };
struct rtable *rt;
+
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
@@ -1012,7 +1058,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
} else {
- unsigned nflags = 0;
+ unsigned int nflags = 0;
t = netdev_priv(dev);
@@ -1125,7 +1171,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
- const void *daddr, const void *saddr, unsigned len)
+ const void *daddr, const void *saddr, unsigned int len)
{
struct ip_tunnel *t = netdev_priv(dev);
struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1167,13 +1213,19 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi fl = { .oif = t->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = t->parms.iph.daddr,
- .saddr = t->parms.iph.saddr,
- .tos = RT_TOS(t->parms.iph.tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = t->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = t->parms.iph.daddr,
+ .saddr = t->parms.iph.saddr,
+ .tos = RT_TOS(t->parms.iph.tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ };
struct rtable *rt;
+
if (ip_route_output_key(dev_net(dev), &rt, &fl))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
@@ -1193,10 +1245,8 @@ static int ipgre_close(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
- if (in_dev) {
+ if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
- in_dev_put(in_dev);
- }
}
return 0;
}
@@ -1213,12 +1263,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
+ .ndo_get_stats = ipgre_get_stats,
};
+static void ipgre_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipgre_dev_free;
dev->type = ARPHRD_IPGRE;
dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1256,6 +1313,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
} else
dev->header_ops = &ipgre_header_ops;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
return 0;
}
@@ -1274,14 +1335,13 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
tunnel->hlen = sizeof(struct iphdr) + 4;
dev_hold(dev);
- ign->tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
}
-static const struct net_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
- .netns_ok = 1,
+static const struct gre_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
};
static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1291,11 +1351,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
for (prio = 0; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = ign->tunnels[prio][h];
+ struct ip_tunnel *t;
+
+ t = rtnl_dereference(ign->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
@@ -1441,6 +1503,10 @@ static int ipgre_tap_init(struct net_device *dev)
ipgre_tunnel_bind_dev(dev);
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
return 0;
}
@@ -1451,6 +1517,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
+ .ndo_get_stats = ipgre_get_stats,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -1459,7 +1526,7 @@ static void ipgre_tap_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &ipgre_tap_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipgre_dev_free;
dev->iflink = 0;
dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1487,6 +1554,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
if (!tb[IFLA_MTU])
dev->mtu = mtu;
+ /* Can use a lockless transmit, unless we generate output sequences */
+ if (!(nt->parms.o_flags & GRE_SEQ))
+ dev->features |= NETIF_F_LLTX;
+
err = register_netdevice(dev);
if (err)
goto out;
@@ -1522,7 +1593,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
t = nt;
if (dev->type != ARPHRD_ETHER) {
- unsigned nflags = 0;
+ unsigned int nflags = 0;
if (ipv4_is_multicast(p.iph.daddr))
nflags = IFF_BROADCAST;
@@ -1663,7 +1734,7 @@ static int __init ipgre_init(void)
if (err < 0)
return err;
- err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
+ err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
printk(KERN_INFO "ipgre init: can't add protocol\n");
goto add_proto_failed;
@@ -1683,7 +1754,7 @@ out:
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
unregister_pernet_device(&ipgre_net_ops);
goto out;
@@ -1693,7 +1764,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+ if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
printk(KERN_INFO "ipgre close: can't remove protocol\n");
unregister_pernet_device(&ipgre_net_ops);
}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488e..1906fa35860 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
}
return -EINVAL;
}
-
+EXPORT_SYMBOL(ip_options_compile);
/*
* Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
return 0;
}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04b69896df5..439d2a34ee4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -487,10 +487,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
* LATER: this step can be merged to real generation of fragments,
* we can switch to copy when see the first bad fragment.
*/
- if (skb_has_frags(skb)) {
- struct sk_buff *frag;
+ if (skb_has_frag_list(skb)) {
+ struct sk_buff *frag, *frag2;
int first_len = skb_pagelen(skb);
- int truesizes = 0;
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
@@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
- goto slow_path;
+ goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
- goto slow_path;
+ goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
}
- truesizes += frag->truesize;
+ skb->truesize -= frag->truesize;
}
/* Everything is OK. Generate! */
@@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
frag = skb_shinfo(skb)->frag_list;
skb_frag_list_init(skb);
skb->data_len = first_len - skb_headlen(skb);
- skb->truesize -= truesizes;
skb->len = first_len;
iph->tot_len = htons(first_len);
iph->frag_off = htons(IP_MF);
@@ -576,6 +574,15 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
}
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
}
slow_path:
@@ -837,10 +844,9 @@ int ip_append_data(struct sock *sk,
inet->cork.length = 0;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
- if ((exthdrlen = rt->dst.header_len) != 0) {
- length += exthdrlen;
- transhdrlen += exthdrlen;
- }
+ exthdrlen = rt->dst.header_len;
+ length += exthdrlen;
+ transhdrlen += exthdrlen;
} else {
rt = (struct rtable *)inet->cork.dst;
if (inet->cork.flags & IPCORK_OPT)
@@ -927,16 +933,19 @@ alloc_new_skb:
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
- alloclen = datalen + fragheaderlen;
+ alloclen = fraglen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
- if (datalen == length + fraggap)
+ if (datalen == length + fraggap) {
alloclen += rt->dst.trailer_len;
-
+ /* make sure mtu is not reached */
+ if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
+ datalen -= ALIGN(rt->dst.trailer_len, 8);
+ }
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
@@ -953,7 +962,7 @@ alloc_new_skb:
else
/* only the initial fragment is
time stamped */
- ipc->shtx.flags = 0;
+ ipc->tx_flags = 0;
}
if (skb == NULL)
goto error;
@@ -964,7 +973,7 @@ alloc_new_skb:
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
- *skb_tx(skb) = ipc->shtx;
+ skb_shinfo(skb)->tx_flags = ipc->tx_flags;
/*
* Find where to start putting bytes.
@@ -1384,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
daddr = ipc.addr = rt->rt_src;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
if (replyopts.opt.optlen) {
ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6c40a8c46e7..64b70ad162e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1129,6 +1129,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_HDRINCL:
val = inet->hdrincl;
break;
+ case IP_NODEFRAG:
+ val = inet->nodefrag;
+ break;
case IP_MTU_DISCOVER:
val = inet->pmtudisc;
break;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec036731a70..e9b816e6cd7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
static int ipip_net_id __read_mostly;
struct ipip_net {
- struct ip_tunnel *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel *tunnels_r[HASH_SIZE];
- struct ip_tunnel *tunnels_l[HASH_SIZE];
- struct ip_tunnel *tunnels_wc[1];
- struct ip_tunnel **tunnels[4];
+ struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_wc[1];
+ struct ip_tunnel __rcu **tunnels[4];
struct net_device *fb_tunnel_dev;
};
-static void ipip_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
static void ipip_tunnel_setup(struct net_device *dev);
+static void ipip_dev_free(struct net_device *dev);
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ipip_lock);
#define for_each_ip_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(local);
struct ip_tunnel *t;
struct ipip_net *ipn = net_generic(net, ipip_net_id);
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
return NULL;
}
-static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
+static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
struct ip_tunnel_parm *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- unsigned h = 0;
+ unsigned int h = 0;
int prio = 0;
if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
return &ipn->tunnels[prio][h];
}
-static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
+static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
struct ip_tunnel *t)
{
return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
-
- for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ipip_lock);
- *tp = t->next;
- spin_unlock_bh(&ipip_lock);
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipip_bucket(ipn, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipip_bucket(ipn, t);
+ struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
- spin_lock_bh(&ipip_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ipip_lock);
}
static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- struct ip_tunnel *t, **tp, *nt;
+ struct ip_tunnel *t, *nt;
+ struct ip_tunnel __rcu **tp;
struct net_device *dev;
char name[IFNAMSIZ];
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) {
+ for (tp = __ipip_bucket(ipn, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
return t;
}
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
if (parms->name[0])
strlcpy(name, parms->name, IFNAMSIZ);
else
- sprintf(name, "tunl%%d");
+ strcpy(name, "tunl%d");
dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
if (dev == NULL)
@@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
nt = netdev_priv(dev);
nt->parms = *parms;
- ipip_tunnel_init(dev);
+ if (ipip_tunnel_init(dev) < 0)
+ goto failed_free;
if (register_netdevice(dev) < 0)
goto failed_free;
@@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
return nt;
failed_free:
- free_netdev(dev);
+ ipip_dev_free(dev);
return NULL;
}
+/* called with RTNL */
static void ipip_tunnel_uninit(struct net_device *dev)
{
struct net *net = dev_net(dev);
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- if (dev == ipn->fb_tunnel_dev) {
- spin_lock_bh(&ipip_lock);
- ipn->tunnels_wc[0] = NULL;
- spin_unlock_bh(&ipip_lock);
- } else
+ if (dev == ipn->fb_tunnel_dev)
+ rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
+ else
ipip_tunnel_unlink(ipn, netdev_priv(dev));
dev_put(dev);
}
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
rcu_read_lock();
- if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
- iph->saddr, iph->daddr)) != NULL) {
+ tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+ if (tunnel != NULL) {
+ struct pcpu_tstats *tstats;
+
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
rcu_read_unlock();
kfree_skb(skb);
@@ -374,10 +406,16 @@ static int ipip_rcv(struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
skb->pkt_type = PACKET_HOST;
- skb_tunnel_rx(skb, tunnel->dev);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, tunnel->dev);
ipip_ecn_decapsulate(iph, skb);
+
netif_rx(skb);
+
rcu_read_unlock();
return 0;
}
@@ -394,13 +432,12 @@ static int ipip_rcv(struct sk_buff *skb)
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct pcpu_tstats *tstats;
struct iphdr *tiph = &tunnel->parms.iph;
u8 tos = tunnel->parms.iph.tos;
__be16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = ip_hdr(skb);
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
@@ -410,13 +447,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (skb->protocol != htons(ETH_P_IP))
goto tx_error;
- if (tos&1)
+ if (tos & 1)
tos = old_iph->tos;
if (!dst) {
/* NBMA tunnel */
if ((rt = skb_rtable(skb)) == NULL) {
- stats->tx_fifo_errors++;
+ dev->stats.tx_fifo_errors++;
goto tx_error;
}
if ((dst = rt->rt_gateway) == 0)
@@ -424,14 +461,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
{
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
}
@@ -439,7 +482,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (tdev == dev) {
ip_rt_put(rt);
- stats->collisions++;
+ dev->stats.collisions++;
goto tx_error;
}
@@ -449,7 +492,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (mtu < 68) {
- stats->collisions++;
+ dev->stats.collisions++;
ip_rt_put(rt);
goto tx_error;
}
@@ -485,7 +528,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- txq->tx_dropped++;
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -522,14 +565,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
iph->ttl = old_iph->ttl;
nf_reset(skb);
-
- IPTUNNEL_XMIT();
+ tstats = this_cpu_ptr(dev->tstats);
+ __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- stats->tx_errors++;
+ dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -544,13 +587,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
iph = &tunnel->parms.iph;
if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
struct rtable *rt;
+
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
tdev = rt->dst.dev;
ip_rt_put(rt);
@@ -696,13 +745,19 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
.ndo_change_mtu = ipip_tunnel_change_mtu,
-
+ .ndo_get_stats = ipip_get_stats,
};
+static void ipip_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipip_dev_free;
dev->type = ARPHRD_TUNNEL;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +766,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
}
-static void ipip_tunnel_init(struct net_device *dev)
+static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -725,9 +781,15 @@ static void ipip_tunnel_init(struct net_device *dev)
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
ipip_tunnel_bind_dev(dev);
+
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
}
-static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
+static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +802,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
dev_hold(dev);
- ipn->tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+ return 0;
}
-static struct xfrm_tunnel ipip_handler = {
+static struct xfrm_tunnel ipip_handler __read_mostly = {
.handler = ipip_rcv,
.err_handler = ipip_err,
.priority = 1,
@@ -760,11 +827,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
for (prio = 1; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = ipn->tunnels[prio][h];
+ struct ip_tunnel *t;
+ t = rtnl_dereference(ipn->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
@@ -789,7 +857,9 @@ static int __net_init ipip_init_net(struct net *net)
}
dev_net_set(ipn->fb_tunnel_dev, net);
- ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+ err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
if ((err = register_netdev(ipn->fb_tunnel_dev)))
goto err_reg_dev;
@@ -797,7 +867,7 @@ static int __net_init ipip_init_net(struct net *net)
return 0;
err_reg_dev:
- free_netdev(ipn->fb_tunnel_dev);
+ ipip_dev_free(ipn->fb_tunnel_dev);
err_alloc_dev:
/* nothing */
return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866f..86dd5691af4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
struct net *net;
#endif
u32 id;
- struct sock *mroute_sk;
+ struct sock __rcu *mroute_sk;
struct timer_list ipmr_expire_timer;
struct list_head mfc_unres_queue;
struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +98,7 @@ struct ipmr_result {
};
/* Big lock, protecting vif table, mrt cache and mroute socket state.
- Note that the changes are semaphored via rtnl_lock.
+ * Note that the changes are semaphored via rtnl_lock.
*/
static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
static DEFINE_SPINLOCK(mfc_unres_lock);
/* We return to original Alan's scheme. Hash table of resolved
- entries is changed only in process context and protected
- with weak lock mrt_lock. Queue of unresolved entries is protected
- with strong spinlock mfc_unres_lock.
-
- In this case data path is free of exclusive locks at all.
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
*/
static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
set_fs(KERNEL_DS);
err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
set_fs(oldfs);
- } else
+ } else {
err = -EOPNOTSUPP;
-
+ }
dev = NULL;
if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
dev->iflink = 0;
rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
rcu_read_unlock();
goto failure;
}
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
mrt->mroute_reg_vif_num = -1;
#endif
- if (vifi+1 == mrt->maxvif) {
+ if (vifi + 1 == mrt->maxvif) {
int tmp;
- for (tmp=vifi-1; tmp>=0; tmp--) {
+
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
if (VIF_EXISTS(mrt, tmp))
break;
}
@@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
dev_set_allmulti(dev, -1);
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
ip_rt_multicast_event(in_dev);
}
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
dev_put(dev);
return 0;
}
-static inline void ipmr_cache_free(struct mfc_cache *c)
+static void ipmr_cache_free_rcu(struct rcu_head *head)
{
+ struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
kmem_cache_free(mrt_cachep, c);
}
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+ call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
/* Destroy an unresolved cache entry, killing queued skbs
- and reporting error to netlink readers.
+ * and reporting error to netlink readers.
*/
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
memset(&e->msg, 0, sizeof(e->msg));
rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
- } else
+ } else {
kfree_skb(skb);
+ }
}
ipmr_cache_free(c);
@@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
case 0:
if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
- if (dev && dev->ip_ptr == NULL) {
+ if (dev && __in_dev_get_rtnl(dev) == NULL) {
dev_put(dev);
return -EADDRNOTAVAIL;
}
- } else
+ } else {
dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
-
+ }
if (!dev)
return -EADDRNOTAVAIL;
err = dev_set_allmulti(dev, 1);
@@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EINVAL;
}
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev) {
dev_put(dev);
return -EADDRNOTAVAIL;
}
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
ip_rt_multicast_event(in_dev);
- /*
- * Fill in the VIF structures
- */
+ /* Fill in the VIF structures */
+
v->rate_limit = vifc->vifc_rate_limit;
v->local = vifc->vifc_lcl_addr.s_addr;
v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
v->pkt_in = 0;
v->pkt_out = 0;
v->link = dev->ifindex;
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
v->link = dev->iflink;
/* And finish update writing critical data */
write_lock_bh(&mrt_lock);
v->dev = dev;
#ifdef CONFIG_IP_PIMSM
- if (v->flags&VIFF_REGISTER)
+ if (v->flags & VIFF_REGISTER)
mrt->mroute_reg_vif_num = vifi;
#endif
if (vifi+1 > mrt->maxvif)
@@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return 0;
}
+/* called with rcu_read_lock() */
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
__be32 origin,
__be32 mcastgrp)
@@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
int line = MFC_HASH(mcastgrp, origin);
struct mfc_cache *c;
- list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
return c;
}
@@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
- if (c == NULL)
- return NULL;
- c->mfc_un.res.minvif = MAXVIFS;
+
+ if (c)
+ c->mfc_un.res.minvif = MAXVIFS;
return c;
}
static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
- if (c == NULL)
- return NULL;
- skb_queue_head_init(&c->mfc_un.unres.unresolved);
- c->mfc_un.unres.expires = jiffies + 10*HZ;
+
+ if (c) {
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10*HZ;
+ }
return c;
}
@@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
struct sk_buff *skb;
struct nlmsgerr *e;
- /*
- * Play the pending entries through our router
- */
+ /* Play the pending entries through our router */
while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
- nlh->nlmsg_len = (skb_tail_pointer(skb) -
- (u8 *)nlh);
+ nlh->nlmsg_len = skb_tail_pointer(skb) -
+ (u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
}
rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
- } else
+ } else {
ip_mr_forward(net, mrt, skb, c, 0);
+ }
}
}
@@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
const int ihl = ip_hdrlen(pkt);
struct igmphdr *igmp;
struct igmpmsg *msg;
+ struct sock *mroute_sk;
int ret;
#ifdef CONFIG_IP_PIMSM
@@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT) {
/* Ugly, but we have no choice with this interface.
- Duplicate old header, fix ihl, length etc.
- And all this only to mangle msg->im_msgtype and
- to set msg->im_mbz to "mbz" :-)
+ * Duplicate old header, fix ihl, length etc.
+ * And all this only to mangle msg->im_msgtype and
+ * to set msg->im_mbz to "mbz" :-)
*/
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
@@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
#endif
{
- /*
- * Copy the IP header
- */
+ /* Copy the IP header */
skb->network_header = skb->tail;
skb_put(skb, ihl);
skb_copy_to_linear_data(skb, pkt->data, ihl);
- ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
+ ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
skb_dst_set(skb, dst_clone(skb_dst(pkt)));
- /*
- * Add our header
- */
+ /* Add our header */
- igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
igmp->type =
msg->im_msgtype = assert;
- igmp->code = 0;
- ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
skb->transport_header = skb->network_header;
}
- if (mrt->mroute_sk == NULL) {
+ rcu_read_lock();
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk == NULL) {
+ rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
- /*
- * Deliver to mrouted
- */
- ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
+ /* Deliver to mrouted */
+
+ ret = sock_queue_rcv_skb(mroute_sk, skb);
+ rcu_read_unlock();
if (ret < 0) {
if (net_ratelimit())
printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
}
if (!found) {
- /*
- * Create a new entry if allowable
- */
+ /* Create a new entry if allowable */
if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
(c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
return -ENOBUFS;
}
- /*
- * Fill in the new cache entry
- */
+ /* Fill in the new cache entry */
+
c->mfc_parent = -1;
c->mfc_origin = iph->saddr;
c->mfc_mcastgrp = iph->daddr;
- /*
- * Reflect first query at mrouted.
- */
+ /* Reflect first query at mrouted. */
+
err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
if (err < 0) {
/* If the report failed throw the cache entry
@@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
}
- /*
- * See if we can append the packet
- */
- if (c->mfc_un.unres.unresolved.qlen>3) {
+ /* See if we can append the packet */
+
+ if (c->mfc_un.unres.unresolved.qlen > 3) {
kfree_skb(skb);
err = -ENOBUFS;
} else {
@@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
- write_lock_bh(&mrt_lock);
- list_del(&c->list);
- write_unlock_bh(&mrt_lock);
+ list_del_rcu(&c->list);
ipmr_cache_free(c);
return 0;
@@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
- write_lock_bh(&mrt_lock);
- list_add(&c->list, &mrt->mfc_cache_array[line]);
- write_unlock_bh(&mrt_lock);
+ list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
/*
* Check to see if we resolved a queued list. If so we
@@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
LIST_HEAD(list);
struct mfc_cache *c, *next;
- /*
- * Shut down all active vif entries
- */
+ /* Shut down all active vif entries */
+
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif_table[i].flags&VIFF_STATIC))
+ if (!(mrt->vif_table[i].flags & VIFF_STATIC))
vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
- /*
- * Wipe the cache
- */
+ /* Wipe the cache */
+
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (c->mfc_flags&MFC_STATIC)
+ if (c->mfc_flags & MFC_STATIC)
continue;
- write_lock_bh(&mrt_lock);
- list_del(&c->list);
- write_unlock_bh(&mrt_lock);
-
+ list_del_rcu(&c->list);
ipmr_cache_free(c);
}
}
@@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
}
}
+/* called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
static void mrtsock_destruct(struct sock *sk)
{
struct net *net = sock_net(sk);
@@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk)
rtnl_lock();
ipmr_for_each_table(mrt, net) {
- if (sk == mrt->mroute_sk) {
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
-
- write_lock_bh(&mrt_lock);
- mrt->mroute_sk = NULL;
- write_unlock_bh(&mrt_lock);
-
+ rcu_assign_pointer(mrt->mroute_sk, NULL);
mroute_clean_tables(mrt);
}
}
@@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENOENT;
if (optname != MRT_INIT) {
- if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
+ if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
+ !capable(CAP_NET_ADMIN))
return -EACCES;
}
@@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENOPROTOOPT;
rtnl_lock();
- if (mrt->mroute_sk) {
+ if (rtnl_dereference(mrt->mroute_sk)) {
rtnl_unlock();
return -EADDRINUSE;
}
ret = ip_ra_control(sk, 1, mrtsock_destruct);
if (ret == 0) {
- write_lock_bh(&mrt_lock);
- mrt->mroute_sk = sk;
- write_unlock_bh(&mrt_lock);
-
+ rcu_assign_pointer(mrt->mroute_sk, sk);
IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
}
rtnl_unlock();
return ret;
case MRT_DONE:
- if (sk != mrt->mroute_sk)
+ if (sk != rcu_dereference_raw(mrt->mroute_sk))
return -EACCES;
return ip_ra_control(sk, 0, NULL);
case MRT_ADD_VIF:
@@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENFILE;
rtnl_lock();
if (optname == MRT_ADD_VIF) {
- ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
+ ret = vif_add(net, mrt, &vif,
+ sk == rtnl_dereference(mrt->mroute_sk));
} else {
ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
}
@@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (optname == MRT_DEL_MFC)
ret = ipmr_mfc_delete(mrt, &mfc);
else
- ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
+ ret = ipmr_mfc_add(net, mrt, &mfc,
+ sk == rtnl_dereference(mrt->mroute_sk));
rtnl_unlock();
return ret;
/*
@@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
case MRT_ASSERT:
{
int v;
- if (get_user(v,(int __user *)optval))
+ if (get_user(v, (int __user *)optval))
return -EFAULT;
mrt->mroute_do_assert = (v) ? 1 : 0;
return 0;
@@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
{
int v;
- if (get_user(v,(int __user *)optval))
+ if (get_user(v, (int __user *)optval))
return -EFAULT;
v = (v) ? 1 : 0;
@@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -EINVAL;
if (get_user(v, (u32 __user *)optval))
return -EFAULT;
- if (sk == mrt->mroute_sk)
- return -EBUSY;
rtnl_lock();
ret = 0;
- if (!ipmr_new_table(net, v))
- ret = -ENOMEM;
- raw_sk(sk)->ipmr_table = v;
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ ret = -EBUSY;
+ } else {
+ if (!ipmr_new_table(net, v))
+ ret = -ENOMEM;
+ raw_sk(sk)->ipmr_table = v;
+ }
rtnl_unlock();
return ret;
}
@@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
- optname!=MRT_PIM &&
+ optname != MRT_PIM &&
#endif
- optname!=MRT_ASSERT)
+ optname != MRT_ASSERT)
return -ENOPROTOOPT;
if (get_user(olr, optlen))
@@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
if (copy_from_user(&sr, arg, sizeof(sr)))
return -EFAULT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
if (c) {
sr.pktcnt = c->mfc_un.res.pkt;
sr.bytecnt = c->mfc_un.res.bytes;
sr.wrong_if = c->mfc_un.res.wrong_if;
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
default:
return -ENOIOCTLCMD;
@@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
};
/*
- * Encapsulate a packet by attaching a valid IPIP header to it.
+ * Encapsulate a packet by attaching a valid IPIP header to it.
* This avoids tunnel drivers and other mess and gives us the speed so
* important for multicast video.
*/
@@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
skb_reset_network_header(skb);
iph = ip_hdr(skb);
- iph->version = 4;
+ iph->version = 4;
iph->tos = old_iph->tos;
iph->ttl = old_iph->ttl;
iph->frag_off = 0;
@@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
static inline int ipmr_forward_finish(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
@@ -1535,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
}
#endif
- if (vif->flags&VIFF_TUNNEL) {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = vif->remote,
- .saddr = vif->local,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ if (vif->flags & VIFF_TUNNEL) {
+ struct flowi fl = {
+ .oif = vif->link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = vif->remote,
+ .saddr = vif->local,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(net, &rt, &fl))
goto out_free;
encap = sizeof(struct iphdr);
} else {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = vif->link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(net, &rt, &fl))
goto out_free;
}
@@ -1559,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
- allow to send ICMP, so that packets will disappear
- to blackhole.
+ * allow to send ICMP, so that packets will disappear
+ * to blackhole.
*/
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1583,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
ip_decrease_ttl(ip_hdr(skb));
/* FIXME: forward and output firewalls used to be called here.
- * What do we do with netfilter? -- RR */
+ * What do we do with netfilter? -- RR
+ */
if (vif->flags & VIFF_TUNNEL) {
ip_encap(skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
@@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (skb_rtable(skb)->fl.iif == 0) {
/* It is our own packet, looped back.
- Very complicated situation...
-
- The best workaround until routing daemons will be
- fixed is not to redistribute packet, if it was
- send through wrong interface. It means, that
- multicast applications WILL NOT work for
- (S,G), which have default multicast route pointing
- to wrong oif. In any case, it is not a good
- idea to use multicasting applications on router.
+ * Very complicated situation...
+ *
+ * The best workaround until routing daemons will be
+ * fixed is not to redistribute packet, if it was
+ * send through wrong interface. It means, that
+ * multicast applications WILL NOT work for
+ * (S,G), which have default multicast route pointing
+ * to wrong oif. In any case, it is not a good
+ * idea to use multicasting applications on router.
*/
goto dont_forward;
}
@@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
- so that we cannot check that packet arrived on an oif.
- It is bad, but otherwise we would need to move pretty
- large chunk of pimd to kernel. Ough... --ANK
+ * so that we cannot check that packet arrived on an oif.
+ * It is bad, but otherwise we would need to move pretty
+ * large chunk of pimd to kernel. Ough... --ANK
*/
(mrt->mroute_do_pim ||
cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
/*
* Forward the frame
*/
- for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+ for (ct = cache->mfc_un.res.maxvif - 1;
+ ct >= cache->mfc_un.res.minvif; ct--) {
if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
ipmr_queue_xmit(net, mrt, skb2, cache,
psend);
@@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (psend != -1) {
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
ipmr_queue_xmit(net, mrt, skb2, cache, psend);
} else {
@@ -1713,6 +1728,7 @@ dont_forward:
/*
* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
*/
int ip_mr_input(struct sk_buff *skb)
@@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb)
int err;
/* Packet is looped back after forward, it should not be
- forwarded second time, but still can be delivered locally.
+ * forwarded second time, but still can be delivered locally.
*/
- if (IPCB(skb)->flags&IPSKB_FORWARDED)
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto dont_forward;
err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb)
}
if (!local) {
- if (IPCB(skb)->opt.router_alert) {
- if (ip_call_ra_chain(skb))
- return 0;
- } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
- /* IGMPv1 (and broken IGMPv2 implementations sort of
- Cisco IOS <= 11.2(8)) do not put router alert
- option to IGMP packets destined to routable
- groups. It is very bad, because it means
- that we can forward NO IGMP messages.
- */
- read_lock(&mrt_lock);
- if (mrt->mroute_sk) {
- nf_reset(skb);
- raw_rcv(mrt->mroute_sk, skb);
- read_unlock(&mrt_lock);
- return 0;
- }
- read_unlock(&mrt_lock);
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+ /* IGMPv1 (and broken IGMPv2 implementations sort of
+ * Cisco IOS <= 11.2(8)) do not put router alert
+ * option to IGMP packets destined to routable
+ * groups. It is very bad, because it means
+ * that we can forward NO IGMP messages.
+ */
+ struct sock *mroute_sk;
+
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk) {
+ nf_reset(skb);
+ raw_rcv(mroute_sk, skb);
+ return 0;
+ }
}
}
- read_lock(&mrt_lock);
+ /* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
/*
@@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb)
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
ip_local_deliver(skb);
- if (skb2 == NULL) {
- read_unlock(&mrt_lock);
+ if (skb2 == NULL)
return -ENOBUFS;
- }
skb = skb2;
}
+ read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, skb->dev);
if (vif >= 0) {
int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb)
return -ENODEV;
}
+ read_lock(&mrt_lock);
ip_mr_forward(net, mrt, skb, cache, local);
-
read_unlock(&mrt_lock);
if (local)
@@ -1805,6 +1820,7 @@ dont_forward:
}
#ifdef CONFIG_IP_PIMSM
+/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
unsigned int pimlen)
{
@@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
/*
- Check that:
- a. packet is really destinted to a multicast group
- b. packet is not a NULL-REGISTER
- c. packet is not truncated
+ * Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
*/
if (!ipv4_is_multicast(encap->daddr) ||
encap->tot_len == 0 ||
@@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
read_lock(&mrt_lock);
if (mrt->mroute_reg_vif_num >= 0)
reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- if (reg_dev)
- dev_hold(reg_dev);
read_unlock(&mrt_lock);
if (reg_dev == NULL)
return 1;
skb->mac_header = skb->network_header;
- skb_pull(skb, (u8*)encap - skb->data);
+ skb_pull(skb, (u8 *)encap - skb->data);
skb_reset_network_header(skb);
skb->protocol = htons(ETH_P_IP);
- skb->ip_summed = 0;
+ skb->ip_summed = CHECKSUM_NONE;
skb->pkt_type = PACKET_HOST;
skb_tunnel_rx(skb, reg_dev);
netif_rx(skb);
- dev_put(reg_dev);
- return 0;
+ return NET_RX_SUCCESS;
}
#endif
@@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
* Handle IGMP messages of PIMv1
*/
-int pim_rcv_v1(struct sk_buff * skb)
+int pim_rcv_v1(struct sk_buff *skb)
{
struct igmphdr *pim;
struct net *net = dev_net(skb->dev);
@@ -1881,7 +1894,7 @@ drop:
#endif
#ifdef CONFIG_IP_PIMSM_V2
-static int pim_rcv(struct sk_buff * skb)
+static int pim_rcv(struct sk_buff *skb)
{
struct pimreghdr *pim;
struct net *net = dev_net(skb->dev);
@@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
- (pim->flags&PIM_NULL_REGISTER) ||
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
(ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
goto drop;
@@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net,
if (mrt == NULL)
return -ENOENT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
if (cache == NULL) {
struct sk_buff *skb2;
struct iphdr *iph;
struct net_device *dev;
- int vif;
+ int vif = -1;
if (nowait) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EAGAIN;
}
dev = skb->dev;
- if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
+ read_lock(&mrt_lock);
+ if (dev)
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif < 0) {
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENODEV;
}
skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2) {
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net,
iph->version = 0;
err = ipmr_cache_unresolved(mrt, vif, skb2);
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
- if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+ read_lock(&mrt_lock);
+ if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
cache->mfc_flags |= MFC_NOTIFY;
err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
@@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[1];
s_e = cb->args[2];
- read_lock(&mrt_lock);
+ rcu_read_lock();
ipmr_for_each_table(mrt, net) {
if (t < s_t)
goto next_table;
if (t > s_t)
s_h = 0;
for (h = s_h; h < MFC_LINES; h++) {
- list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
if (e < s_e)
goto next_entry;
if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2096,7 @@ next_table:
t++;
}
done:
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
cb->args[2] = e;
cb->args[1] = h;
@@ -2086,7 +2107,8 @@ done:
#ifdef CONFIG_PROC_FS
/*
- * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
+ * The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
*/
struct ipmr_vif_iter {
struct seq_net_private p;
@@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct mr_table *mrt = it->mrt;
struct mfc_cache *mfc;
- read_lock(&mrt_lock);
+ rcu_read_lock();
for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
it->cache = &mrt->mfc_cache_array[it->ct];
- list_for_each_entry(mfc, it->cache, list)
+ list_for_each_entry_rcu(mfc, it->cache, list)
if (pos-- == 0)
return mfc;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
spin_lock_bh(&mfc_unres_lock);
it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
/* exhausted cache_array, show unresolved */
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
it->cache = &mrt->mfc_unres_queue;
it->ct = 0;
@@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (!list_empty(it->cache))
return list_first_entry(it->cache, struct mfc_cache, list);
- end_of_list:
+end_of_list:
spin_unlock_bh(&mfc_unres_lock);
it->cache = NULL;
@@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
if (it->cache == &mrt->mfc_unres_queue)
spin_unlock_bh(&mfc_unres_lock);
else if (it->cache == &mrt->mfc_cache_array[it->ct])
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
mfc->mfc_un.res.bytes,
mfc->mfc_un.res.wrong_if);
for (n = mfc->mfc_un.res.minvif;
- n < mfc->mfc_un.res.maxvif; n++ ) {
+ n < mfc->mfc_un.res.maxvif; n++) {
if (VIF_EXISTS(mrt, n) &&
mfc->mfc_un.res.ttls[n] < 255)
seq_printf(seq,
@@ -2421,7 +2443,7 @@ int __init ip_mr_init(void)
mrt_cachep = kmem_cache_create("ip_mrt_cache",
sizeof(struct mfc_cache),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
if (!mrt_cachep)
return -ENOMEM;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d048275a62c..babd1a2bae5 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
config IP_NF_TARGET_TTL
tristate '"TTL" target support'
- depends on NETFILTER_ADVANCED
+ depends on NETFILTER_ADVANCED && IP_NF_MANGLE
select NETFILTER_XT_TARGET_HL
---help---
- This is a backwards-compat option for the user's convenience
+ This is a backwards-compatible option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_HL.
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 6bccba31d13..3cad2591ace 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
for (i = 0; i < len; i++)
ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
- return (ret != 0);
+ return ret != 0;
}
/*
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
return NF_DROP;
}
-static inline const struct arpt_entry_target *
+static inline const struct xt_entry_target *
arpt_get_target_c(const struct arpt_entry *e)
{
return arpt_get_target((struct arpt_entry *)e);
@@ -282,7 +282,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
arp = arp_hdr(skb);
do {
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
e = arpt_next_entry(e);
@@ -297,10 +297,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
if (!t->u.kernel.target->target) {
int v;
- v = ((struct arpt_standard_target *)t)->verdict;
+ v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
- if (v != ARPT_RETURN) {
+ if (v != XT_RETURN) {
verdict = (unsigned)(-v) - 1;
break;
}
@@ -332,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
/* Target might have changed stuff. */
arp = arp_hdr(skb);
- if (verdict == ARPT_CONTINUE)
+ if (verdict == XT_CONTINUE)
e = arpt_next_entry(e);
else
/* Verdict */
@@ -377,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- const struct arpt_standard_target *t
+ const struct xt_standard_target *t
= (void *)arpt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
@@ -392,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
/* Unconditional return/END. */
if ((e->target_offset == sizeof(struct arpt_entry) &&
(strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < 0 && unconditional(&e->arp)) ||
visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -433,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
if (newpos > newinfo->size -
sizeof(struct arpt_entry)) {
@@ -464,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
static inline int check_entry(const struct arpt_entry *e, const char *name)
{
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
if (!arp_checkentry(&e->arp)) {
duprintf("arp_tables: arp check failed %p %s.\n", e, name);
return -EINVAL;
}
- if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
+ if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
return -EINVAL;
t = arpt_get_target_c(e);
@@ -483,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
static inline int check_target(struct arpt_entry *e, const char *name)
{
- struct arpt_entry_target *t = arpt_get_target(e);
+ struct xt_entry_target *t = arpt_get_target(e);
int ret;
struct xt_tgchk_param par = {
.table = name,
@@ -506,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
static inline int
find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
int ret;
@@ -536,7 +536,7 @@ out:
static bool check_underflow(const struct arpt_entry *e)
{
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int verdict;
if (!unconditional(&e->arp))
@@ -544,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e)
t = arpt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
return false;
- verdict = ((struct arpt_standard_target *)t)->verdict;
+ verdict = ((struct xt_standard_target *)t)->verdict;
verdict = -verdict - 1;
return verdict == NF_DROP || verdict == NF_ACCEPT;
}
@@ -566,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
}
if (e->next_offset
- < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+ < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
duprintf("checking: element %p size %u\n",
e, e->next_offset);
return -EINVAL;
@@ -598,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
static inline void cleanup_entry(struct arpt_entry *e)
{
struct xt_tgdtor_param par;
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
t = arpt_get_target(e);
par.target = t->u.kernel.target;
@@ -735,6 +735,7 @@ static void get_counters(const struct xt_table_info *t,
if (cpu == curcpu)
continue;
i = 0;
+ local_bh_disable();
xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -742,6 +743,7 @@ static void get_counters(const struct xt_table_info *t,
++i;
}
xt_info_wrunlock(cpu);
+ local_bh_enable();
}
put_cpu();
}
@@ -792,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
e = (struct arpt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
@@ -805,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size,
t = arpt_get_target_c(e);
if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct arpt_entry_target,
+ + offsetof(struct xt_entry_target,
u.user.name),
t->u.kernel.target->name,
strlen(t->u.kernel.target->name)+1) != 0) {
@@ -842,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
const struct xt_table_info *info,
const void *base, struct xt_table_info *newinfo)
{
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
@@ -893,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info,
static int get_info(struct net *net, void __user *user,
const int *len, int compat)
{
- char name[ARPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
int ret;
@@ -906,7 +908,7 @@ static int get_info(struct net *net, void __user *user,
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
- name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_lock(NFPROTO_ARP);
@@ -1202,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user,
#ifdef CONFIG_COMPAT
static inline void compat_release_entry(struct compat_arpt_entry *e)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
t = compat_arpt_get_target(e);
module_put(t->u.kernel.target->me);
@@ -1218,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
const unsigned int *underflows,
const char *name)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
unsigned int entry_offset;
int ret, off, h;
@@ -1286,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
unsigned int *size, const char *name,
struct xt_table_info *newinfo, unsigned char *base)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
struct arpt_entry *de;
unsigned int origsize;
@@ -1418,6 +1420,9 @@ static int translate_compat_table(const char *name,
if (ret != 0)
break;
++i;
+ if (strcmp(arpt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
if (ret) {
/*
@@ -1469,7 +1474,7 @@ out_unlock:
}
struct compat_arpt_replace {
- char name[ARPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
u32 num_entries;
u32 size;
@@ -1562,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
struct xt_counters *counters,
unsigned int i)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct compat_arpt_entry __user *ce;
u_int16_t target_offset, next_offset;
compat_uint_t origsize;
@@ -1623,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
}
struct compat_arpt_get_entries {
- char name[ARPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
struct compat_arpt_entry entrytable[0];
};
@@ -1823,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table)
/* The built-in targets: standard (NULL) and error. */
static struct xt_target arpt_builtin_tg[] __read_mostly = {
{
- .name = ARPT_STANDARD_TARGET,
+ .name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_ARP,
#ifdef CONFIG_COMPAT
@@ -1833,9 +1838,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
#endif
},
{
- .name = ARPT_ERROR_TARGET,
+ .name = XT_ERROR_TARGET,
.target = arpt_error,
- .targetsize = ARPT_FUNCTION_MAXNAMELEN,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
.family = NFPROTO_ARP,
},
};
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171..b8ddcc480ed 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par)
return false;
if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
- mangle->target != ARPT_CONTINUE)
+ mangle->target != XT_CONTINUE)
return false;
return true;
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c439721b165..d31b007a6d8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
}
/* for const-correctness */
-static inline const struct ipt_entry_target *
+static inline const struct xt_entry_target *
ipt_get_target_c(const struct ipt_entry *e)
{
return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +230,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
const char *hookname, const char **chainname,
const char **comment, unsigned int *rulenum)
{
- const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
+ const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
- if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
/* Head of user chain: ERROR target with chainname */
*chainname = t->target.data;
(*rulenum) = 0;
@@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
if (s->target_offset == sizeof(struct ipt_entry) &&
strcmp(t->target.u.kernel.target->name,
- IPT_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
t->verdict < 0 &&
unconditional(&s->ip)) {
/* Tail of chains: STANDARD target (return/policy) */
@@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb,
get_entry(table_base, private->underflow[hook]));
do {
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
IP_NF_ASSERT(e);
@@ -380,10 +380,10 @@ ipt_do_table(struct sk_buff *skb,
if (!t->u.kernel.target->target) {
int v;
- v = ((struct ipt_standard_target *)t)->verdict;
+ v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
- if (v != IPT_RETURN) {
+ if (v != XT_RETURN) {
verdict = (unsigned)(-v) - 1;
break;
}
@@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb,
verdict = t->u.kernel.target->target(skb, &acpar);
/* Target might have changed stuff. */
ip = ip_hdr(skb);
- if (verdict == IPT_CONTINUE)
+ if (verdict == XT_CONTINUE)
e = ipt_next_entry(e);
else
/* Verdict */
@@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- const struct ipt_standard_target *t
+ const struct xt_standard_target *t
= (void *)ipt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
@@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
/* Unconditional return/END. */
if ((e->target_offset == sizeof(struct ipt_entry) &&
(strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < 0 && unconditional(&e->ip)) ||
visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
if (newpos > newinfo->size -
sizeof(struct ipt_entry)) {
@@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
return 1;
}
-static void cleanup_match(struct ipt_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
{
struct xt_mtdtor_param par;
@@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
static int
check_entry(const struct ipt_entry *e, const char *name)
{
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
if (!ip_checkentry(&e->ip)) {
duprintf("ip check failed %p %s.\n", e, par->match->name);
return -EINVAL;
}
- if (e->target_offset + sizeof(struct ipt_entry_target) >
+ if (e->target_offset + sizeof(struct xt_entry_target) >
e->next_offset)
return -EINVAL;
@@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name)
}
static int
-check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
const struct ipt_ip *ip = par->entryinfo;
int ret;
@@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
}
static int
-find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
struct xt_match *match;
int ret;
@@ -630,7 +630,7 @@ err:
static int check_target(struct ipt_entry *e, struct net *net, const char *name)
{
- struct ipt_entry_target *t = ipt_get_target(e);
+ struct xt_entry_target *t = ipt_get_target(e);
struct xt_tgchk_param par = {
.net = net,
.table = name,
@@ -656,7 +656,7 @@ static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int size)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
int ret;
unsigned int j;
@@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
static bool check_underflow(const struct ipt_entry *e)
{
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int verdict;
if (!unconditional(&e->ip))
@@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e)
t = ipt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
return false;
- verdict = ((struct ipt_standard_target *)t)->verdict;
+ verdict = ((struct xt_standard_target *)t)->verdict;
verdict = -verdict - 1;
return verdict == NF_DROP || verdict == NF_ACCEPT;
}
@@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
}
if (e->next_offset
- < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+ < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
duprintf("checking: element %p size %u\n",
e, e->next_offset);
return -EINVAL;
@@ -771,7 +771,7 @@ static void
cleanup_entry(struct ipt_entry *e, struct net *net)
{
struct xt_tgdtor_param par;
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_entry_match *ematch;
/* Cleanup all matches */
@@ -909,6 +909,7 @@ get_counters(const struct xt_table_info *t,
if (cpu == curcpu)
continue;
i = 0;
+ local_bh_disable();
xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -916,6 +917,7 @@ get_counters(const struct xt_table_info *t,
++i; /* macro does multi eval of i */
}
xt_info_wrunlock(cpu);
+ local_bh_enable();
}
put_cpu();
}
@@ -970,8 +972,8 @@ copy_entries_to_user(unsigned int total_size,
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
unsigned int i;
- const struct ipt_entry_match *m;
- const struct ipt_entry_target *t;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
e = (struct ipt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
@@ -988,7 +990,7 @@ copy_entries_to_user(unsigned int total_size,
m = (void *)e + i;
if (copy_to_user(userptr + off + i
- + offsetof(struct ipt_entry_match,
+ + offsetof(struct xt_entry_match,
u.user.name),
m->u.kernel.match->name,
strlen(m->u.kernel.match->name)+1)
@@ -1000,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size,
t = ipt_get_target_c(e);
if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct ipt_entry_target,
+ + offsetof(struct xt_entry_target,
u.user.name),
t->u.kernel.target->name,
strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1038,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
const void *base, struct xt_table_info *newinfo)
{
const struct xt_entry_match *ematch;
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
@@ -1090,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info,
static int get_info(struct net *net, void __user *user,
const int *len, int compat)
{
- char name[IPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
int ret;
@@ -1103,7 +1105,7 @@ static int get_info(struct net *net, void __user *user,
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
- name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_lock(AF_INET);
@@ -1398,14 +1400,14 @@ do_add_counters(struct net *net, const void __user *user,
#ifdef CONFIG_COMPAT
struct compat_ipt_replace {
- char name[IPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
u32 num_entries;
u32 size;
u32 hook_entry[NF_INET_NUMHOOKS];
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
- compat_uptr_t counters; /* struct ipt_counters * */
+ compat_uptr_t counters; /* struct xt_counters * */
struct compat_ipt_entry entries[0];
};
@@ -1414,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
unsigned int *size, struct xt_counters *counters,
unsigned int i)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct compat_ipt_entry __user *ce;
u_int16_t target_offset, next_offset;
compat_uint_t origsize;
@@ -1449,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
}
static int
-compat_find_calc_match(struct ipt_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
const char *name,
const struct ipt_ip *ip,
unsigned int hookmask,
@@ -1471,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
static void compat_release_entry(struct compat_ipt_entry *e)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_entry_match *ematch;
/* Cleanup all matches */
@@ -1492,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
const char *name)
{
struct xt_entry_match *ematch;
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
unsigned int entry_offset;
unsigned int j;
@@ -1574,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
unsigned int *size, const char *name,
struct xt_table_info *newinfo, unsigned char *base)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
struct ipt_entry *de;
unsigned int origsize;
@@ -1749,6 +1751,9 @@ translate_compat_table(struct net *net,
if (ret != 0)
break;
++i;
+ if (strcmp(ipt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
if (ret) {
/*
@@ -1879,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
}
struct compat_ipt_get_entries {
- char name[IPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
struct compat_ipt_entry entrytable[0];
};
@@ -2034,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IPT_SO_GET_REVISION_MATCH:
case IPT_SO_GET_REVISION_TARGET: {
- struct ipt_get_revision rev;
+ struct xt_get_revision rev;
int target;
if (*len != sizeof(rev)) {
@@ -2171,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
static struct xt_target ipt_builtin_tg[] __read_mostly = {
{
- .name = IPT_STANDARD_TARGET,
+ .name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_IPV4,
#ifdef CONFIG_COMPAT
@@ -2181,9 +2186,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
#endif
},
{
- .name = IPT_ERROR_TARGET,
+ .name = XT_ERROR_TARGET,
.target = ipt_error,
- .targetsize = IPT_FUNCTION_MAXNAMELEN,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
.family = NFPROTO_IPV4,
},
};
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a43cf36db8..1e26a489765 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
#include <net/checksum.h>
+#include <net/ip.h>
#define CLUSTERIP_VERSION "0.8"
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
{
const struct iphdr *iph = ip_hdr(skb);
unsigned long hashval;
- u_int16_t sport, dport;
- const u_int16_t *ports;
-
- switch (iph->protocol) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- case IPPROTO_ICMP:
- ports = (const void *)iph+iph->ihl*4;
- sport = ports[0];
- dport = ports[1];
- break;
- default:
+ u_int16_t sport = 0, dport = 0;
+ int poff;
+
+ poff = proto_ports_offset(iph->protocol);
+ if (poff >= 0) {
+ const u_int16_t *ports;
+ u16 _ports[2];
+
+ ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
+ if (ports) {
+ sport = ports[0];
+ dport = ports[1];
+ }
+ } else {
if (net_ratelimit())
pr_info("unknown protocol %u\n", iph->protocol);
- sport = dport = 0;
}
switch (config->hash_mode) {
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 915fc17d7ce..72ffc8fda2e 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -24,16 +24,15 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_LOG.h>
#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
/* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb,
unsigned int iphoff)
{
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Important fields:
* TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- printk("SRC=%pI4 DST=%pI4 ",
+ sb_add(m, "SRC=%pI4 DST=%pI4 ",
&ih->saddr, &ih->daddr);
/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
/* Max length: 6 "CE DF MF " */
if (ntohs(ih->frag_off) & IP_CE)
- printk("CE ");
+ sb_add(m, "CE ");
if (ntohs(ih->frag_off) & IP_DF)
- printk("DF ");
+ sb_add(m, "DF ");
if (ntohs(ih->frag_off) & IP_MF)
- printk("MF ");
+ sb_add(m, "MF ");
/* Max length: 11 "FRAG:65535 " */
if (ntohs(ih->frag_off) & IP_OFFSET)
- printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+ sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
if ((logflags & IPT_LOG_IPOPT) &&
ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
op = skb_header_pointer(skb, iphoff+sizeof(_iph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct tcphdr *th;
/* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
+ sb_add(m, "PROTO=TCP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_tcph), &_tcph);
if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
+ sb_add(m, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
if (logflags & IPT_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
+ sb_add(m, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
+ sb_add(m, "WINDOW=%u ", ntohs(th->window));
/* Max length: 9 "RES=0x3F " */
- printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
if (th->cwr)
- printk("CWR ");
+ sb_add(m, "CWR ");
if (th->ece)
- printk("ECE ");
+ sb_add(m, "ECE ");
if (th->urg)
- printk("URG ");
+ sb_add(m, "URG ");
if (th->ack)
- printk("ACK ");
+ sb_add(m, "ACK ");
if (th->psh)
- printk("PSH ");
+ sb_add(m, "PSH ");
if (th->rst)
- printk("RST ");
+ sb_add(m, "RST ");
if (th->syn)
- printk("SYN ");
+ sb_add(m, "SYN ");
if (th->fin)
- printk("FIN ");
+ sb_add(m, "FIN ");
/* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
+ sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
if ((logflags & IPT_LOG_TCPOPT) &&
th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
iphoff+ih->ihl*4+sizeof(_tcph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
break;
}
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
if (ih->protocol == IPPROTO_UDP)
/* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
+ sb_add(m, "PROTO=UDP " );
else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
+ sb_add(m, "PROTO=UDPLITE ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_udph), &_udph);
if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
+ sb_add(m, "SPT=%u DPT=%u LEN=%u ",
ntohs(uh->source), ntohs(uh->dest),
ntohs(uh->len));
break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
[ICMP_ADDRESSREPLY] = 12 };
/* Max length: 11 "PROTO=ICMP " */
- printk("PROTO=ICMP ");
+ sb_add(m, "PROTO=ICMP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_icmph), &_icmph);
if (ich == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+ sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
if (ich->type <= NR_ICMP_TYPES &&
required_len[ich->type] &&
skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
case ICMP_ECHOREPLY:
case ICMP_ECHO:
/* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
+ sb_add(m, "ID=%u SEQ=%u ",
ntohs(ich->un.echo.id),
ntohs(ich->un.echo.sequence));
break;
case ICMP_PARAMETERPROB:
/* Max length: 14 "PARAMETER=255 " */
- printk("PARAMETER=%u ",
+ sb_add(m, "PARAMETER=%u ",
ntohl(ich->un.gateway) >> 24);
break;
case ICMP_REDIRECT:
/* Max length: 24 "GATEWAY=255.255.255.255 " */
- printk("GATEWAY=%pI4 ", &ich->un.gateway);
+ sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
/* Fall through */
case ICMP_DEST_UNREACH:
case ICMP_SOURCE_QUENCH:
case ICMP_TIME_EXCEEDED:
/* Max length: 3+maxlen */
if (!iphoff) { /* Only recurse once. */
- printk("[");
- dump_packet(info, skb,
+ sb_add(m, "[");
+ dump_packet(m, info, skb,
iphoff + ih->ihl*4+sizeof(_icmph));
- printk("] ");
+ sb_add(m, "] ");
}
/* Max length: 10 "MTU=65535 " */
if (ich->type == ICMP_DEST_UNREACH &&
ich->code == ICMP_FRAG_NEEDED)
- printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+ sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
}
break;
}
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
break;
/* Max length: 9 "PROTO=AH " */
- printk("PROTO=AH ");
+ sb_add(m, "PROTO=AH ");
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_ahdr), &_ahdr);
if (ah == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(ah->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
break;
}
case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_esp_hdr *eh;
/* Max length: 10 "PROTO=ESP " */
- printk("PROTO=ESP ");
+ sb_add(m, "PROTO=ESP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_esph), &_esph);
if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(eh->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
break;
}
/* Max length: 10 "PROTO 255 " */
default:
- printk("PROTO=%u ", ih->protocol);
+ sb_add(m, "PROTO=%u ", ih->protocol);
}
/* Max length: 15 "UID=4294967295 " */
if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u GID=%u ",
+ sb_add(m, "UID=%u GID=%u ",
skb->sk->sk_socket->file->f_cred->fsuid,
skb->sk->sk_socket->file->f_cred->fsgid);
read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (!iphoff && skb->mark)
- printk("MARK=0x%x ", skb->mark);
+ sb_add(m, "MARK=0x%x ", skb->mark);
/* Proto Max log string length */
/* IP: 40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
switch (dev->type) {
case ARPHRD_ETHER:
- printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
ntohs(eth_hdr(skb)->h_proto));
return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
}
fallback:
- printk("MAC=");
+ sb_add(m, "MAC=");
if (dev->hard_header_len &&
skb->mac_header != skb->network_header) {
const unsigned char *p = skb_mac_header(skb);
unsigned int i;
- printk("%02x", *p++);
+ sb_add(m, "%02x", *p++);
for (i = 1; i < dev->hard_header_len; i++, p++)
- printk(":%02x", *p);
+ sb_add(m, ":%02x", *p);
}
- printk(" ");
+ sb_add(m, " ");
}
static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
+ struct sbuff *m = sb_open();
+
if (!loginfo)
loginfo = &default_loginfo;
- spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
prefix,
in ? in->name : "",
out ? out->name : "");
@@ -434,20 +435,20 @@ ipt_log_packet(u_int8_t pf,
physindev = skb->nf_bridge->physindev;
if (physindev && in != physindev)
- printk("PHYSIN=%s ", physindev->name);
+ sb_add(m, "PHYSIN=%s ", physindev->name);
physoutdev = skb->nf_bridge->physoutdev;
if (physoutdev && out != physoutdev)
- printk("PHYSOUT=%s ", physoutdev->name);
+ sb_add(m, "PHYSOUT=%s ", physoutdev->name);
}
#endif
/* MAC logging for input path only. */
if (in && !out)
- dump_mac_header(loginfo, skb);
+ dump_mac_header(m, loginfo, skb);
+
+ dump_packet(m, loginfo, skb, 0);
- dump_packet(loginfo, skb, 0);
- printk("\n");
- spin_unlock_bh(&log_lock);
+ sb_close(m);
}
static unsigned int
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b254dafaf42..43eec80c0e7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -112,6 +112,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
/* ip_route_me_harder expects skb->dst to be set */
skb_dst_set_noref(nskb, skb_dst(oldskb));
+ nskb->protocol = htons(ETH_P_IP);
if (ip_route_me_harder(nskb, addr_type))
goto free_nskb;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d6..37f8adb68c7 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/percpu.h>
+#include <linux/security.h>
#include <net/net_namespace.h>
#include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
rcu_read_unlock();
}
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return ret;
+
+ ret = seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len);
+ return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0;
+}
+#endif
+
static int ct_seq_show(struct seq_file *s, void *v)
{
struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#endif
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
- if (seq_printf(s, "secmark=%u ", ct->secmark))
+ if (ct_show_secctx(s, ct))
goto release;
-#endif
if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
goto release;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index eab8de32f20..f3a9b42b16c 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,9 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+ struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && inet->nodefrag)
+ if (sk && (sk->sk_family == PF_INET) &&
+ inet->nodefrag)
return NF_ACCEPT;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b8766825..0f23b3f06df 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93..295c97431e4 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
static struct nf_conntrack_l3proto *l3proto __read_mostly;
#define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
__read_mostly;
static inline const struct nf_nat_protocol *
@@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum)
return rcu_dereference(nf_nat_protos[protonum]);
}
-const struct nf_nat_protocol *
+static const struct nf_nat_protocol *
nf_nat_proto_find_get(u_int8_t protonum)
{
const struct nf_nat_protocol *p;
@@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum)
return p;
}
-EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
-void
+static void
nf_nat_proto_put(const struct nf_nat_protocol *p)
{
module_put(p->me);
}
-EXPORT_SYMBOL_GPL(nf_nat_proto_put);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
@@ -262,11 +260,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
/* Only bother mapping if it's not already in range and unique */
- if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
- (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
- proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
- !nf_nat_used_tuple(tuple, ct))
- goto out;
+ if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+ if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+ if (proto->in_range(tuple, maniptype, &range->min,
+ &range->max) &&
+ (range->min.all == range->max.all ||
+ !nf_nat_used_tuple(tuple, ct)))
+ goto out;
+ } else if (!nf_nat_used_tuple(tuple, ct)) {
+ goto out;
+ }
+ }
/* Last change: get protocol to try to obtain unique tuple. */
proto->unique_tuple(tuple, range, maniptype, ct);
@@ -458,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
return 0;
}
+ if (manip == IP_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply dir. */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
pr_debug("icmp_reply_translation: translating error %p manip %u "
"dir %s\n", skb, manip,
dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -492,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
/* Change outer to look the reply to an incoming packet
* (proto 0 means don't invert per-proto part). */
- if (manip == IP_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply dir. */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- if (!manip_pkt(0, skb, 0, &target, manip))
- return 0;
- }
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ if (!manip_pkt(0, skb, 0, &target, manip))
+ return 0;
return 1;
}
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a..dc73abb3fe2 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853..790f3160e01 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get a pair of ports. */
for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
nated_port != 0; nated_port += 2) {
+ int ret;
+
rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
- if (nf_ct_expect_related(rtp_exp) == 0) {
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == 0) {
rtcp_exp->tuple.dst.u.udp.port =
htons(nated_port + 1);
- if (nf_ct_expect_related(rtcp_exp) == 0)
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ nated_port = 0;
break;
- nf_ct_unexpect_related(rtp_exp);
+ }
+ } else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
}
}
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (; nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
break;
+ }
}
if (nated_port == 0) { /* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (; nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
}
if (nated_port == 0) { /* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (; nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
break;
+ }
}
if (nated_port == 0) { /* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
}
if (nated_port == 0) { /* No port available */
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548ee..31427fb57aa 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
}
EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
+static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
+ int datalen, __sum16 *check, int oldlen)
+{
+ struct rtable *rt = skb_rtable(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt_flags & RTCF_LOCAL) &&
+ skb->dev->features & NETIF_F_V4_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ iph->ihl * 4;
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, iph->protocol, 0);
+ } else {
+ *check = 0;
+ *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, iph->protocol,
+ csum_partial(data, datalen,
+ 0));
+ if (iph->protocol == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
/* Generic function for mangling variable-length address changes inside
* NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
* command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len, bool adjust)
{
- struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
struct tcphdr *tcph;
int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
match_offset, match_len, rep_buffer, rep_len);
datalen = skb->len - iph->ihl*4;
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- skb->dev->features & NETIF_F_V4_CSUM) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- iph->ihl * 4;
- skb->csum_offset = offsetof(struct tcphdr, check);
- tcph->check = ~tcp_v4_check(datalen,
- iph->saddr, iph->daddr, 0);
- } else {
- tcph->check = 0;
- tcph->check = tcp_v4_check(datalen,
- iph->saddr, iph->daddr,
- csum_partial(tcph,
- datalen, 0));
- }
- } else
- inet_proto_csum_replace2(&tcph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
if (adjust && rep_len != match_len)
nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len)
{
- struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
struct udphdr *udph;
int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
return 1;
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- skb->dev->features & NETIF_F_V4_CSUM) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- iph->ihl * 4;
- skb->csum_offset = offsetof(struct udphdr, check);
- udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, IPPROTO_UDP,
- 0);
- } else {
- udph->check = 0;
- udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, IPPROTO_UDP,
- csum_partial(udph,
- datalen, 0));
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
- } else
- inet_proto_csum_replace2(&udph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
return 1;
}
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03..535e1a80235 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ebbd319f62f..21c30426480 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
/* Force range to this IP; let proto decide mapping for
per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
- Use reply in case it's already been mangled (eg local packet).
*/
- __be32 ip
- = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
- ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
- : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
- struct nf_nat_range range
- = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
- pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
+ struct nf_nat_range range;
+
+ range.flags = 0;
+ pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+ HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaae..e40cf7816fd 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
exp->expectfn = ip_nat_sip_expected;
for (; port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.udp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
/* Try to get same pair of ports: if not, try to change them. */
for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
port != 0; port += 2) {
+ int ret;
+
rtp_exp->tuple.dst.u.udp.port = htons(port);
- if (nf_ct_expect_related(rtp_exp) != 0)
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == -EBUSY)
continue;
+ else if (ret < 0) {
+ port = 0;
+ break;
+ }
rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
- if (nf_ct_expect_related(rtcp_exp) == 0)
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
break;
- nf_ct_unexpect_related(rtp_exp);
+ else if (ret != -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ port = 0;
+ break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 1679e2c0963..ee5f419d0a5 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum,
unsigned char s[4];
if (offset & 1) {
- s[0] = s[2] = 0;
+ s[0] = ~0;
s[1] = ~*optr;
+ s[2] = 0;
s[3] = *nptr;
} else {
- s[1] = s[3] = 0;
s[0] = ~*optr;
+ s[1] = ~0;
s[2] = *nptr;
+ s[3] = 0;
}
*csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index f2d29735140..65699c24411 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
#include <linux/spinlock.h>
#include <net/protocol.h>
-const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
-static DEFINE_SPINLOCK(inet_proto_lock);
+const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly;
/*
* Add a protocol handler to the hash tables
@@ -37,20 +36,9 @@ static DEFINE_SPINLOCK(inet_proto_lock);
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int hash, ret;
+ int hash = protocol & (MAX_INET_PROTOS - 1);
- hash = protocol & (MAX_INET_PROTOS - 1);
-
- spin_lock_bh(&inet_proto_lock);
- if (inet_protos[hash]) {
- ret = -1;
- } else {
- inet_protos[hash] = prot;
- ret = 0;
- }
- spin_unlock_bh(&inet_proto_lock);
-
- return ret;
+ return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);
@@ -60,18 +48,9 @@ EXPORT_SYMBOL(inet_add_protocol);
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int hash, ret;
-
- hash = protocol & (MAX_INET_PROTOS - 1);
+ int ret, hash = protocol & (MAX_INET_PROTOS - 1);
- spin_lock_bh(&inet_proto_lock);
- if (inet_protos[hash] == prot) {
- inet_protos[hash] = NULL;
- ret = 0;
- } else {
- ret = -1;
- }
- spin_unlock_bh(&inet_proto_lock);
+ ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1;
synchronize_net();
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 009a7b2aa1e..1f85ef28989 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f56b6e6c6a..d6cb2bfcd8e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
.local_out = __ip_local_out,
- .entries = ATOMIC_INIT(0),
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
- atomic_read(&ipv4_dst_ops.entries),
+ dst_entries_get_slow(&ipv4_dst_ops),
st->in_hit,
st->in_slow_tot,
st->in_slow_mc,
@@ -945,6 +944,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
struct rtable *rth, **rthp;
unsigned long now = jiffies;
int goal;
+ int entries = dst_entries_get_fast(&ipv4_dst_ops);
/*
* Garbage collection is pretty expensive,
@@ -954,28 +954,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
RT_CACHE_STAT_INC(gc_total);
if (now - last_gc < ip_rt_gc_min_interval &&
- atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+ entries < ip_rt_max_size) {
RT_CACHE_STAT_INC(gc_ignored);
goto out;
}
+ entries = dst_entries_get_slow(&ipv4_dst_ops);
/* Calculate number of entries, which we want to expire now. */
- goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << rt_hash_log);
+ goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ goal = entries - equilibrium;
if (goal > 0) {
equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ goal = entries - equilibrium;
}
} else {
/* We are in dangerous area. Try to reduce cache really
* aggressively.
*/
goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ equilibrium = entries - goal;
}
if (now - last_gc >= ip_rt_gc_min_interval)
@@ -1032,14 +1032,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
expire >>= 1;
#if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, i);
+ dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
} while (!in_softirq() && time_before_eq(jiffies, now));
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+ goto out;
+ if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
if (net_ratelimit())
printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1051,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
work_done:
expire += ip_rt_gc_min_interval;
if (expire > ip_rt_gc_timeout ||
- atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+ dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+ dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, rover);
+ dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out: return 0;
}
@@ -1102,23 +1105,23 @@ restart:
* Note that we do rt_free on this new route entry, so that
* once its refcount hits zero, we are still able to reap it
* (Thanks Alexey)
- * Note also the rt_free uses call_rcu. We don't actually
- * need rcu protection here, this is just our path to get
- * on the route gc list.
+ * Note: To avoid expensive rcu stuff for this uncached dst,
+ * we set DST_NOCACHE so that dst_release() can free dst without
+ * waiting a grace period.
*/
+ rt->dst.flags |= DST_NOCACHE;
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
int err = arp_bind_neighbour(&rt->dst);
if (err) {
if (net_ratelimit())
printk(KERN_WARNING
"Neighbour table failure & not caching routes.\n");
- rt_drop(rt);
+ ip_rt_put(rt);
return err;
}
}
- rt_free(rt);
goto skip_hashing;
}
@@ -1231,7 +1234,7 @@ restart:
}
if (net_ratelimit())
- printk(KERN_WARNING "Neighbour table overflow.\n");
+ printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
rt_drop(rt);
return -ENOBUFS;
}
@@ -1268,18 +1271,11 @@ skip_hashing:
void rt_bind_peer(struct rtable *rt, int create)
{
- static DEFINE_SPINLOCK(rt_peer_lock);
struct inet_peer *peer;
peer = inet_getpeer(rt->rt_dst, create);
- spin_lock_bh(&rt_peer_lock);
- if (rt->peer == NULL) {
- rt->peer = peer;
- peer = NULL;
- }
- spin_unlock_bh(&rt_peer_lock);
- if (peer)
+ if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
inet_putpeer(peer);
}
@@ -1779,12 +1775,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt->fl.iif == 0)
src = rt->rt_src;
- else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
- src = FIB_RES_PREFSRC(res);
- fib_res_put(&res);
- } else
- src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
+ else {
+ rcu_read_lock();
+ if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
+ src = FIB_RES_PREFSRC(res);
+ else
+ src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
RT_SCOPE_UNIVERSE);
+ rcu_read_unlock();
+ }
memcpy(addr, &src, 4);
}
@@ -2087,6 +2086,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
* 2. IP spoofing attempts are filtered with 100% of guarantee.
+ * called with rcu_read_lock()
*/
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2108,7 +2108,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
unsigned hash;
__be32 spec_dst;
int err = -EINVAL;
- int free_res = 0;
struct net * net = dev_net(dev);
/* IP on this device is disabled. */
@@ -2124,7 +2123,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
ipv4_is_loopback(saddr))
goto martian_source;
- if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
+ if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
/* Accept zero addresses only to limited broadcast;
@@ -2133,19 +2132,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_zeronet(saddr))
goto martian_source;
- if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
- ipv4_is_loopback(daddr))
+ if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
goto martian_destination;
/*
* Now we are ready to route packet.
*/
- if ((err = fib_lookup(net, &fl, &res)) != 0) {
+ err = fib_lookup(net, &fl, &res);
+ if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
goto no_route;
}
- free_res = 1;
RT_CACHE_STAT_INC(in_slow_tot);
@@ -2154,8 +2152,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type == RTN_LOCAL) {
err = fib_validate_source(saddr, daddr, tos,
- net->loopback_dev->ifindex,
- dev, &spec_dst, &itag, skb->mark);
+ net->loopback_dev->ifindex,
+ dev, &spec_dst, &itag, skb->mark);
if (err < 0)
goto martian_source_keep_err;
if (err)
@@ -2170,9 +2168,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_destination;
err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
-done:
- if (free_res)
- fib_res_put(&res);
out: return err;
brd_input:
@@ -2232,7 +2227,7 @@ local_input:
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
- goto done;
+ goto out;
no_route:
RT_CACHE_STAT_INC(in_no_route);
@@ -2255,21 +2250,21 @@ martian_destination:
e_hostunreach:
err = -EHOSTUNREACH;
- goto done;
+ goto out;
e_inval:
err = -EINVAL;
- goto done;
+ goto out;
e_nobufs:
err = -ENOBUFS;
- goto done;
+ goto out;
martian_source:
err = -EINVAL;
martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
- goto done;
+ goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2355,6 +2350,7 @@ skip_cache:
}
EXPORT_SYMBOL(ip_route_input_common);
+/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
struct fib_result *res,
const struct flowi *fl,
@@ -2365,53 +2361,47 @@ static int __mkroute_output(struct rtable **result,
struct rtable *rth;
struct in_device *in_dev;
u32 tos = RT_FL_TOS(oldflp);
- int err = 0;
- if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+ if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
return -EINVAL;
- if (fl->fl4_dst == htonl(0xFFFFFFFF))
+ if (ipv4_is_lbcast(fl->fl4_dst))
res->type = RTN_BROADCAST;
else if (ipv4_is_multicast(fl->fl4_dst))
res->type = RTN_MULTICAST;
- else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
+ else if (ipv4_is_zeronet(fl->fl4_dst))
return -EINVAL;
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- /* get work reference to inet device */
- in_dev = in_dev_get(dev_out);
+ in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev)
return -EINVAL;
if (res->type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- if (res->fi) {
- fib_info_put(res->fi);
- res->fi = NULL;
- }
+ res->fi = NULL;
} else if (res->type == RTN_MULTICAST) {
- flags |= RTCF_MULTICAST|RTCF_LOCAL;
+ flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
oldflp->proto))
flags &= ~RTCF_LOCAL;
/* If multicast route do not exist use
- default one, but do not gateway in this case.
- Yes, it is hack.
+ * default one, but do not gateway in this case.
+ * Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4) {
- fib_info_put(res->fi);
+ if (res->fi && res->prefixlen < 4)
res->fi = NULL;
- }
}
rth = dst_alloc(&ipv4_dst_ops);
- if (!rth) {
- err = -ENOBUFS;
- goto cleanup;
- }
+ if (!rth)
+ return -ENOBUFS;
+
+ in_dev_hold(in_dev);
+ rth->idev = in_dev;
atomic_set(&rth->dst.__refcnt, 1);
rth->dst.flags= DST_HOST;
@@ -2432,7 +2422,6 @@ static int __mkroute_output(struct rtable **result,
cache entry */
rth->dst.dev = dev_out;
dev_hold(dev_out);
- rth->idev = in_dev_get(dev_out);
rth->rt_gateway = fl->fl4_dst;
rth->rt_spec_dst= fl->fl4_src;
@@ -2467,15 +2456,11 @@ static int __mkroute_output(struct rtable **result,
rt_set_nexthop(rth, res, 0);
rth->rt_flags = flags;
-
*result = rth;
- cleanup:
- /* release work reference to inet device */
- in_dev_put(in_dev);
-
- return err;
+ return 0;
}
+/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
struct fib_result *res,
const struct flowi *fl,
@@ -2497,6 +2482,7 @@ static int ip_mkroute_output(struct rtable **rp,
/*
* Major route resolver routine.
+ * called with rcu_read_lock();
*/
static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2515,9 +2501,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
.iif = net->loopback_dev->ifindex,
.oif = oldflp->oif };
struct fib_result res;
- unsigned flags = 0;
+ unsigned int flags = 0;
struct net_device *dev_out = NULL;
- int free_res = 0;
int err;
@@ -2543,9 +2528,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (oldflp->oif == 0 &&
(ipv4_is_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ ipv4_is_lbcast(oldflp->fl4_dst))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(net, oldflp->fl4_src);
+ dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
if (dev_out == NULL)
goto out;
@@ -2570,29 +2555,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(net, oldflp->fl4_src);
- if (dev_out == NULL)
+ if (!__ip_dev_find(net, oldflp->fl4_src, false))
goto out;
- dev_put(dev_out);
- dev_out = NULL;
}
}
if (oldflp->oif) {
- dev_out = dev_get_by_index(net, oldflp->oif);
+ dev_out = dev_get_by_index_rcu(net, oldflp->oif);
err = -ENODEV;
if (dev_out == NULL)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
- if (__in_dev_get_rtnl(dev_out) == NULL) {
- dev_put(dev_out);
+ if (rcu_dereference(dev_out->ip_ptr) == NULL)
goto out; /* Wrong error code */
- }
if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+ ipv4_is_lbcast(oldflp->fl4_dst)) {
if (!fl.fl4_src)
fl.fl4_src = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
@@ -2612,10 +2592,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
fl.fl4_dst = fl.fl4_src;
if (!fl.fl4_dst)
fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
- if (dev_out)
- dev_put(dev_out);
dev_out = net->loopback_dev;
- dev_hold(dev_out);
fl.oif = net->loopback_dev->ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
@@ -2649,23 +2626,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
res.type = RTN_UNICAST;
goto make_route;
}
- if (dev_out)
- dev_put(dev_out);
err = -ENETUNREACH;
goto out;
}
- free_res = 1;
if (res.type == RTN_LOCAL) {
if (!fl.fl4_src)
fl.fl4_src = fl.fl4_dst;
- if (dev_out)
- dev_put(dev_out);
dev_out = net->loopback_dev;
- dev_hold(dev_out);
fl.oif = dev_out->ifindex;
- if (res.fi)
- fib_info_put(res.fi);
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2682,28 +2651,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (!fl.fl4_src)
fl.fl4_src = FIB_RES_PREFSRC(res);
- if (dev_out)
- dev_put(dev_out);
dev_out = FIB_RES_DEV(res);
- dev_hold(dev_out);
fl.oif = dev_out->ifindex;
make_route:
err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
-
- if (free_res)
- fib_res_put(&res);
- if (dev_out)
- dev_put(dev_out);
out: return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
const struct flowi *flp)
{
- unsigned hash;
+ unsigned int hash;
+ int res;
struct rtable *rth;
if (!rt_caching(net))
@@ -2734,10 +2696,18 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rcu_read_unlock_bh();
slow_output:
- return ip_route_output_slow(net, rp, flp);
+ rcu_read_lock();
+ res = ip_route_output_slow(net, rp, flp);
+ rcu_read_unlock();
+ return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
+
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
@@ -2746,9 +2716,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
.destroy = ipv4_dst_destroy,
- .check = ipv4_dst_check,
+ .check = ipv4_blackhole_dst_check,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
- .entries = ATOMIC_INIT(0),
};
@@ -2793,7 +2762,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
dst_release(&(*rp)->dst);
*rp = rt;
- return (rt ? 0 : -ENOMEM);
+ return rt ? 0 : -ENOMEM;
}
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -3318,6 +3287,12 @@ int __init ip_rt_init(void)
ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
+ if (dst_entries_init(&ipv4_dst_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+ if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
rt_hash_table = (struct rt_hash_bucket *)
alloc_large_system_hash("IP route cache",
sizeof(struct rt_hash_bucket),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 176e11aaea7..1664a0590bb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -386,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
*/
mask = 0;
- if (sk->sk_err)
- mask = POLLERR;
/*
* POLLHUP is certainly not done right. But poll() doesn't
@@ -451,11 +449,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
mask |= POLLOUT | POLLWRNORM;
}
- }
+ } else
+ mask |= POLLOUT | POLLWRNORM;
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
}
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ smp_rmb();
+ if (sk->sk_err)
+ mask |= POLLERR;
+
return mask;
}
EXPORT_SYMBOL(tcp_poll);
@@ -939,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sg = sk->sk_route_caps & NETIF_F_SG;
while (--iovlen >= 0) {
- int seglen = iov->iov_len;
+ size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
@@ -2011,11 +2015,8 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- int orphan_count = percpu_counter_read_positive(
- sk->sk_prot->orphan_count);
-
sk_mem_reclaim(sk);
- if (tcp_too_many_orphans(sk, orphan_count)) {
+ if (tcp_too_many_orphans(sk, 0)) {
if (net_ratelimit())
printk(KERN_INFO "TCP: too many of orphaned "
"sockets\n");
@@ -2391,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = tp->af_specific->md5_parse(sk, optval, optlen);
break;
#endif
-
+ case TCP_USER_TIMEOUT:
+ /* Cap the max timeout in ms TCP will retry/retrans
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -2610,6 +2616,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_THIN_DUPACK:
val = tp->thin_dupack;
break;
+
+ case TCP_USER_TIMEOUT:
+ val = jiffies_to_msecs(icsk->icsk_user_timeout);
+ break;
default:
return -ENOPROTOOPT;
}
@@ -3212,7 +3222,7 @@ void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long nr_pages, limit;
- int order, i, max_share;
+ int i, max_share, cnt;
unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3261,22 +3271,12 @@ void __init tcp_init(void)
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
}
- /* Try to be a bit smarter and adjust defaults depending
- * on available memory.
- */
- for (order = 0; ((1 << order) << PAGE_SHIFT) <
- (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
- order++)
- ;
- if (order >= 4) {
- tcp_death_row.sysctl_max_tw_buckets = 180000;
- sysctl_tcp_max_orphans = 4096 << (order - 4);
- sysctl_max_syn_backlog = 1024;
- } else if (order < 3) {
- tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
- sysctl_tcp_max_orphans >>= (3 - order);
- sysctl_max_syn_backlog = 128;
- }
+
+ cnt = tcp_hashinfo.ehash_mask + 1;
+
+ tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+ sysctl_tcp_max_orphans = cnt / 2;
+ sysctl_max_syn_backlog = max(128, cnt / 256);
/* Set the pressure threshold to be a fraction of global memory that
* is up to 1/2 at 256 MB, decreasing toward zero with the amount of
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 0ec9bd0ae94..850c737e08e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
int tcp_set_allowed_congestion_control(char *val)
{
struct tcp_congestion_ops *ca;
- char *clone, *name;
+ char *saved_clone, *clone, *name;
int ret = 0;
- clone = kstrdup(val, GFP_USER);
+ saved_clone = clone = kstrdup(val, GFP_USER);
if (!clone)
return -ENOMEM;
@@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val)
}
out:
spin_unlock(&tcp_cong_list_lock);
+ kfree(saved_clone);
return ret;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bccce3424a6..3357f69e353 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk);
@@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk)
}
}
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- * The RFC specifies a window of no more than 4380 bytes
- * unless 2*MSS > 4380. Reading the pseudocode in the RFC
- * is a bit misleading because they use a clamp at 4380 bytes
- * rather than use a multiplier in the relevant range.
- */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
- if (!cwnd) {
- if (tp->mss_cache > 1460)
- cwnd = 2;
- else
- cwnd = (tp->mss_cache > 1095) ? 3 : 4;
- }
+ if (!cwnd)
+ cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -2314,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+ return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
}
static inline int tcp_head_timedout(struct sock *sk)
@@ -2508,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk)
/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
* is against sacked "cnt", otherwise it's against facked "cnt"
*/
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -2516,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
int err;
unsigned int mss;
- if (packets == 0)
- return;
-
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
+ /* Head already handled? */
+ if (mark_head && skb != tcp_write_queue_head(sk))
+ return;
} else {
skb = tcp_write_queue_head(sk);
cnt = 0;
@@ -2545,7 +2532,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
- if (tcp_is_sack(tp) || (oldcnt >= packets))
+ if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (oldcnt >= packets))
break;
mss = skb_shinfo(skb)->gso_size;
@@ -2556,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
}
tcp_skb_mark_lost(tp, skb);
+
+ if (mark_head)
+ break;
}
tcp_verify_left_out(tp);
}
@@ -2567,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1);
+ tcp_mark_head_lost(sk, 1, 1);
} else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
- tcp_mark_head_lost(sk, lost);
+ tcp_mark_head_lost(sk, lost, 0);
} else {
int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto < fast_rexmit)
- sacked_upto = fast_rexmit;
- tcp_mark_head_lost(sk, sacked_upto);
+ if (sacked_upto >= 0)
+ tcp_mark_head_lost(sk, sacked_upto, 0);
+ else if (fast_rexmit)
+ tcp_mark_head_lost(sk, 1, 1);
}
tcp_timeout_skbs(sk);
@@ -2886,7 +2878,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
@@ -2983,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+ tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
}
@@ -3411,8 +3403,8 @@ static void tcp_ack_probe(struct sock *sk)
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
- return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+ return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3429,9 +3421,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
{
- return (after(ack, tp->snd_una) ||
+ return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
- (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+ (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* Update our send window.
@@ -4048,6 +4040,8 @@ static void tcp_reset(struct sock *sk)
default:
sk->sk_err = ECONNRESET;
}
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 020766292bb..8f8527d4168 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1422,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
- goto exit;
+ goto exit_nonewsk;
newsk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(newsk, dst);
@@ -1469,16 +1469,20 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
#endif
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ sock_put(newsk);
+ goto exit;
+ }
__inet_hash_nolisten(newsk, NULL);
- __inet_inherit_port(sk, newsk);
return newsk;
exit_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+ dst_release(dst);
exit:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
- dst_release(dst);
return NULL;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -2571,7 +2575,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
return tcp_gro_receive(head, skb);
}
-EXPORT_SYMBOL(tcp4_gro_receive);
int tcp4_gro_complete(struct sk_buff *skb)
{
@@ -2584,7 +2587,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
return tcp_gro_complete(skb);
}
-EXPORT_SYMBOL(tcp4_gro_complete);
struct proto tcp_prot = {
.name = "TCP",
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85c..43cf901d765 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
return 1;
if (after(end_seq, s_win) && before(seq, e_win))
return 1;
- return (seq == e_win && seq == end_seq);
+ return seq == e_win && seq == end_seq;
}
/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd845858..05b1ecf3676 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
}
}
- /* Set initial window to value enough for senders,
- * following RFC2414. Senders, not following this RFC,
- * will be satisfied with 2.
- */
+ /* Set initial window to value enough for senders, following RFC5681. */
if (mss > (1 << *rcv_wscale)) {
- int init_cwnd = 4;
- if (mss > 1460 * 3)
- init_cwnd = 2;
- else if (mss > 1460)
- init_cwnd = 3;
+ int init_cwnd = rfc3390_bytes_to_packets(mss);
+
/* when initializing use the value from init_rcv_wnd
* rather than the default from above
*/
@@ -1376,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned mss_now, int nonagle)
{
- return (skb->len < mss_now &&
+ return skb->len < mss_now &&
((nonagle & TCP_NAGLE_CORK) ||
- (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
/* Return non-zero if the Nagle test allows this packet to be
@@ -1449,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
- return (skb &&
+ return skb &&
tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
- tp->nonagle : TCP_NAGLE_PUSH)));
+ tp->nonagle : TCP_NAGLE_PUSH));
}
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -2429,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
__u8 rcv_wscale;
/* Set this up on the first call only */
req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+ req->window_clamp = tcp_full_space(sk);
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(tcp_full_space(sk),
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2555,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk)
tcp_initialize_rcv_mss(sk);
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+ tp->window_clamp = tcp_full_space(sk);
+
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e..6211e211417 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
.owner = THIS_MODULE,
.open = tcpprobe_open,
.read = tcpprobe_read,
+ .llseek = noop_llseek,
};
static __init int tcpprobe_init(void)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 808bb920c9f..74a6aa00365 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -66,18 +66,18 @@ static void tcp_write_err(struct sock *sk)
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
struct tcp_sock *tp = tcp_sk(sk);
- int orphans = percpu_counter_read_positive(&tcp_orphan_count);
+ int shift = 0;
/* If peer does not open window for long time, or did not transmit
* anything for long time, penalize it. */
if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
- orphans <<= 1;
+ shift++;
/* If some dubious ICMP arrived, penalize even more. */
if (sk->sk_err_soft)
- orphans <<= 1;
+ shift++;
- if (tcp_too_many_orphans(sk, orphans)) {
+ if (tcp_too_many_orphans(sk, shift)) {
if (net_ratelimit())
printk(KERN_INFO "Out of socket memory\n");
@@ -135,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
/* This function calculates a "timeout" which is equivalent to the timeout of a
* TCP connection after "boundary" unsuccessful, exponentially backed-off
- * retransmissions with an initial RTO of TCP_RTO_MIN.
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
+ * syn_set flag is set.
*/
static bool retransmits_timed_out(struct sock *sk,
- unsigned int boundary)
+ unsigned int boundary,
+ unsigned int timeout,
+ bool syn_set)
{
- unsigned int timeout, linear_backoff_thresh;
- unsigned int start_ts;
+ unsigned int linear_backoff_thresh, start_ts;
+ unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
if (!inet_csk(sk)->icsk_retransmits)
return false;
@@ -151,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
else
start_ts = tcp_sk(sk)->retrans_stamp;
- linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
-
- if (boundary <= linear_backoff_thresh)
- timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
- else
- timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
- (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ if (likely(timeout == 0)) {
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * rto_base;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+ (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ }
return (tcp_time_stamp - start_ts) >= timeout;
}
@@ -167,14 +171,15 @@ static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int retry_until;
- bool do_reset;
+ bool do_reset, syn_set = 0;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ syn_set = 1;
} else {
- if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -187,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
- !retransmits_timed_out(sk, retry_until);
+ !retransmits_timed_out(sk, retry_until, 0, 0);
if (tcp_out_of_resources(sk, do_reset))
return 1;
}
}
- if (retransmits_timed_out(sk, retry_until)) {
+ if (retransmits_timed_out(sk, retry_until,
+ syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
@@ -361,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0) {
int mib_idx;
- if (icsk->icsk_ca_state == TCP_CA_Disorder) {
- if (tcp_is_sack(tp))
- mib_idx = LINUX_MIB_TCPSACKFAILURES;
- else
- mib_idx = LINUX_MIB_TCPRENOFAILURES;
- } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tcp_is_sack(tp))
mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
else
mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+ } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+ tp->sacked_out) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
} else {
mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
@@ -436,7 +443,7 @@ out_reset_timer:
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
- if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
__sk_dst_reset(sk);
out:;
@@ -556,7 +563,14 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
- if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
+ /* If the TCP_USER_TIMEOUT option is enabled, use that
+ * to determine when to timeout instead.
+ */
+ if ((icsk->icsk_user_timeout != 0 &&
+ elapsed >= icsk->icsk_user_timeout &&
+ icsk->icsk_probes_out > 0) ||
+ (icsk->icsk_user_timeout == 0 &&
+ icsk->icsk_probes_out >= keepalive_probes(tp))) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
goto out;
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a624..a534dda5456 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
*/
static inline u32 westwood_do_filter(u32 a, u32 b)
{
- return (((7 * a) + b) >> 3);
+ return ((7 * a) + b) >> 3;
}
static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 59186ca7808..9a17bd2a0a3 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,8 +14,8 @@
#include <net/protocol.h>
#include <net/xfrm.h>
-static struct xfrm_tunnel *tunnel4_handlers;
-static struct xfrm_tunnel *tunnel64_handlers;
+static struct xfrm_tunnel *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel *tunnel64_handlers __read_mostly;
static DEFINE_MUTEX(tunnel4_mutex);
static inline struct xfrm_tunnel **fam_handlers(unsigned short family)
@@ -39,7 +39,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
}
handler->next = *pprev;
- *pprev = handler;
+ rcu_assign_pointer(*pprev, handler);
ret = 0;
@@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
}
EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
static int tunnel4_rcv(struct sk_buff *skb)
{
struct xfrm_tunnel *handler;
@@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto drop;
- for (handler = tunnel4_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->handler(skb))
return 0;
@@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- for (handler = tunnel64_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->handler(skb))
return 0;
@@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
- for (handler = tunnel4_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
break;
}
@@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
- for (handler = tunnel64_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->err_handler(skb, info))
break;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32e0bef60d0..b3f7e8cf18a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
return -EOPNOTSUPP;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
if (up->pending) {
/*
@@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
- err = sock_tx_timestamp(msg, sk, &ipc.shtx);
+ err = sock_tx_timestamp(sk, &ipc.tx_flags);
if (err)
return err;
if (msg->msg_controllen) {
@@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk)
}
EXPORT_SYMBOL(udp_lib_unhash);
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ nhslot2 = udp_hashslot2(udptable, newhash);
+ udp_sk(sk)->udp_portaddr_hash = newhash;
+ if (hslot2 != nhslot2) {
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ /* we must lock primary chain too */
+ spin_lock_bh(&hslot->lock);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &nhslot2->head);
+ nhslot2->count++;
+ spin_unlock(&nhslot2->lock);
+
+ spin_unlock_bh(&hslot->lock);
+ }
+ }
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+static void udp_v4_rehash(struct sock *sk)
+{
+ u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ udp_lib_rehash(sk, new_hash);
+}
+
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -1843,6 +1886,7 @@ struct proto udp_prot = {
.backlog_rcv = __udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
+ .rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 869078d4eeb..4464f3bff6a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net,
static int xfrm4_get_tos(struct flowi *fl)
{
- return fl->fl4_tos;
+ return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
}
static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
xfrm4_policy_afinfo.garbage_collect(net);
- return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+ return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
.gc_thresh = 1024,
- .entries = ATOMIC_INIT(0),
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
* and start cleaning when were 1/2 full
*/
xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+ dst_entries_init(&xfrm4_dst_ops);
xfrm4_state_init();
xfrm4_policy_init();
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a0..47947624ecc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
}
static void
-__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
- struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+{
+ sel->daddr.a4 = fl->fl4_dst;
+ sel->saddr.a4 = fl->fl4_src;
+ sel->dport = xfrm_flowi_dport(fl);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET;
+ sel->prefixlen_d = 32;
+ sel->prefixlen_s = 32;
+ sel->proto = fl->proto;
+ sel->ifindex = fl->oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+ xfrm_address_t *daddr, xfrm_address_t *saddr)
{
- x->sel.daddr.a4 = fl->fl4_dst;
- x->sel.saddr.a4 = fl->fl4_src;
- x->sel.dport = xfrm_flowi_dport(fl);
- x->sel.dport_mask = htons(0xffff);
- x->sel.sport = xfrm_flowi_sport(fl);
- x->sel.sport_mask = htons(0xffff);
- x->sel.family = AF_INET;
- x->sel.prefixlen_d = 32;
- x->sel.prefixlen_s = 32;
- x->sel.proto = fl->proto;
- x->sel.ifindex = fl->oif;
x->id = tmpl->id;
if (x->id.daddr.a4 == 0)
x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.owner = THIS_MODULE,
.init_flags = xfrm4_init_flags,
.init_tempsel = __xfrm4_init_tempsel,
+ .init_temprop = xfrm4_init_temprop,
.output = xfrm4_output,
.extract_input = xfrm4_extract_input,
.extract_output = xfrm4_extract_output,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d208..82806455e85 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
return -ENOENT;
}
-static struct xfrm_tunnel xfrm_tunnel_handler = {
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
.priority = 2,
};
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static struct xfrm_tunnel xfrm64_tunnel_handler = {
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
.priority = 2,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ab70a3fbcaf..ec7a91d9e86 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -243,7 +243,7 @@ static inline bool addrconf_qdisc_ok(const struct net_device *dev)
/* Check if a route is valid prefix route */
static inline int addrconf_is_prefix_route(const struct rt6_info *rt)
{
- return ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0);
+ return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0;
}
static void addrconf_del_timer(struct inet6_ifaddr *ifp)
@@ -1544,7 +1544,7 @@ static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
return 0;
}
-int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
+static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
{
if (addr == 0)
return -1;
@@ -1560,7 +1560,6 @@ int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
memcpy(eui + 4, &addr, 4);
return 0;
}
-EXPORT_SYMBOL(__ipv6_isatap_ifid);
static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
{
@@ -2964,7 +2963,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
start sending router solicitations.
*/
- if (ifp->idev->cnf.forwarding == 0 &&
+ if ((ifp->idev->cnf.forwarding == 0 ||
+ ifp->idev->cnf.forwarding == 2) &&
ifp->idev->cnf.rtr_solicits > 0 &&
(dev->flags&IFF_LOOPBACK) == 0 &&
(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
@@ -4637,10 +4637,12 @@ int __init addrconf_init(void)
if (err < 0) {
printk(KERN_CRIT "IPv6 Addrconf:"
" cannot initialize default policy table: %d.\n", err);
- return err;
+ goto out;
}
- register_pernet_subsys(&addrconf_ops);
+ err = register_pernet_subsys(&addrconf_ops);
+ if (err < 0)
+ goto out_addrlabel;
/* The addrconf netdev notifier requires that loopback_dev
* has it's ipv6 private information allocated and setup
@@ -4692,7 +4694,9 @@ errout:
unregister_netdevice_notifier(&ipv6_dev_notf);
errlo:
unregister_pernet_subsys(&addrconf_ops);
-
+out_addrlabel:
+ ipv6_addr_label_cleanup();
+out:
return err;
}
@@ -4703,6 +4707,7 @@ void addrconf_cleanup(void)
unregister_netdevice_notifier(&ipv6_dev_notf);
unregister_pernet_subsys(&addrconf_ops);
+ ipv6_addr_label_cleanup();
rtnl_lock();
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index f0e774cea38..c8993e5a337 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -393,6 +393,11 @@ int __init ipv6_addr_label_init(void)
return register_pernet_subsys(&ipv6_addr_label_ops);
}
+void ipv6_addr_label_cleanup(void)
+{
+ unregister_pernet_subsys(&ipv6_addr_label_ops);
+}
+
static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
[IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), },
[IFAL_LABEL] = { .len = sizeof(u32), },
@@ -513,10 +518,9 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
static inline int ip6addrlbl_msgsize(void)
{
- return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
+ return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
+ nla_total_size(16) /* IFAL_ADDRESS */
- + nla_total_size(4) /* IFAL_LABEL */
- );
+ + nla_total_size(4); /* IFAL_LABEL */
}
static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 56b9bf2516f..54e8e42f7a8 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -343,7 +343,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!ipv6_chk_addr(net, &addr->sin6_addr,
+ if (!inet->transparent &&
+ !ipv6_chk_addr(net, &addr->sin6_addr,
dev, 0)) {
err = -EADDRNOTAVAIL;
goto out_unlock;
@@ -467,7 +468,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
sin->sin6_scope_id = sk->sk_bound_dev_if;
*uaddr_len = sizeof(*sin);
- return(0);
+ return 0;
}
EXPORT_SYMBOL(inet6_getname);
@@ -488,7 +489,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCADDRT:
case SIOCDELRT:
- return(ipv6_route_ioctl(net, cmd, (void __user *)arg));
+ return ipv6_route_ioctl(net, cmd, (void __user *)arg);
case SIOCSIFADDR:
return addrconf_add_ifaddr(net, (void __user *) arg);
@@ -502,7 +503,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return sk->sk_prot->ioctl(sk, cmd, arg);
}
/*NOTREACHED*/
- return(0);
+ return 0;
}
EXPORT_SYMBOL(inet6_ioctl);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 7d929a22cbc..320bdb877ee 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -105,9 +105,12 @@ ipv4_connected:
if (ipv6_addr_any(&np->saddr))
ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
- if (ipv6_addr_any(&np->rcv_saddr))
+ if (ipv6_addr_any(&np->rcv_saddr)) {
ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
&np->rcv_saddr);
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
goto out;
}
@@ -181,6 +184,8 @@ ipv4_connected:
if (ipv6_addr_any(&np->rcv_saddr)) {
ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
}
ip6_dst_store(sk, dst,
@@ -572,6 +577,25 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
u8 *ptr = nh + opt->dst1;
put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
}
+ if (np->rxopt.bits.rxorigdstaddr) {
+ struct sockaddr_in6 sin6;
+ u16 *ports = (u16 *) skb_transport_header(skb);
+
+ if (skb_transport_offset(skb) + 4 <= skb->len) {
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
+
+ sin6.sin6_family = AF_INET6;
+ ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr);
+ sin6.sin6_port = ports[1];
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id = 0;
+
+ put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
+ }
+ }
return 0;
}
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index e1caa5d526c..14ed0a955b5 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -13,12 +13,12 @@ int ipv6_ext_hdr(u8 nexthdr)
/*
* find out if nexthdr is an extension header or a protocol
*/
- return ( (nexthdr == NEXTHDR_HOP) ||
+ return (nexthdr == NEXTHDR_HOP) ||
(nexthdr == NEXTHDR_ROUTING) ||
(nexthdr == NEXTHDR_FRAGMENT) ||
(nexthdr == NEXTHDR_AUTH) ||
(nexthdr == NEXTHDR_NONE) ||
- (nexthdr == NEXTHDR_DEST) );
+ (nexthdr == NEXTHDR_DEST);
}
/*
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index b1108ede18e..d829874d894 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -34,11 +34,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl,
{
struct fib_lookup_arg arg = {
.lookup_ptr = lookup,
+ .flags = FIB_LOOKUP_NOREF,
};
fib_rules_lookup(net->ipv6.fib6_rules_ops, fl, flags, &arg);
- if (arg.rule)
- fib_rule_put(arg.rule);
if (arg.result)
return arg.result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b6a585909d3..de382114609 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1500,15 +1500,18 @@ static void fib6_gc_timer_cb(unsigned long arg)
static int __net_init fib6_net_init(struct net *net)
{
+ size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+
setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
if (!net->ipv6.rt6_stats)
goto out_timer;
- net->ipv6.fib_table_hash = kcalloc(FIB6_TABLE_HASHSZ,
- sizeof(*net->ipv6.fib_table_hash),
- GFP_KERNEL);
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
if (!net->ipv6.fib_table_hash)
goto out_rt6_stats;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d40b330c0ee..99157b4cd56 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -637,9 +637,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
}
mtu -= hlen + sizeof(struct frag_hdr);
- if (skb_has_frags(skb)) {
+ if (skb_has_frag_list(skb)) {
int first_len = skb_pagelen(skb);
- int truesizes = 0;
+ struct sk_buff *frag2;
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
@@ -651,18 +651,18 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
- goto slow_path;
+ goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
- goto slow_path;
+ goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
- truesizes += frag->truesize;
}
+ skb->truesize -= frag->truesize;
}
err = 0;
@@ -693,7 +693,6 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
first_len = skb_pagelen(skb);
skb->data_len = first_len - skb_headlen(skb);
- skb->truesize -= truesizes;
skb->len = first_len;
ipv6_hdr(skb)->payload_len = htons(first_len -
sizeof(struct ipv6hdr));
@@ -756,6 +755,15 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
IPSTATS_MIB_FRAGFAILS);
dst_release(&rt->dst);
return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
}
slow_path:
@@ -870,8 +878,8 @@ static inline int ip6_rt_check(struct rt6key *rt_key,
struct in6_addr *fl_addr,
struct in6_addr *addr_cache)
{
- return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
- (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
+ return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
+ (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 0fd027f3f47..c2c0f89397b 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -75,7 +75,7 @@ MODULE_LICENSE("GPL");
(addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
(HASH_SIZE - 1))
-static void ip6_tnl_dev_init(struct net_device *dev);
+static int ip6_tnl_dev_init(struct net_device *dev);
static void ip6_tnl_dev_setup(struct net_device *dev);
static int ip6_tnl_net_id __read_mostly;
@@ -83,15 +83,42 @@ struct ip6_tnl_net {
/* the IPv6 tunnel fallback device */
struct net_device *fb_tnl_dev;
/* lists for storing tunnels in use */
- struct ip6_tnl *tnls_r_l[HASH_SIZE];
- struct ip6_tnl *tnls_wc[1];
- struct ip6_tnl **tnls[2];
+ struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+ struct ip6_tnl __rcu *tnls_wc[1];
+ struct ip6_tnl __rcu **tnls[2];
};
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ip6_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ip6_tnl_lock);
static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
{
@@ -138,8 +165,8 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
static struct ip6_tnl *
ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(local);
struct ip6_tnl *t;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
@@ -167,7 +194,7 @@ ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local)
* Return: head of IPv6 tunnel list
**/
-static struct ip6_tnl **
+static struct ip6_tnl __rcu **
ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p)
{
struct in6_addr *remote = &p->raddr;
@@ -190,12 +217,10 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p)
static void
ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{
- struct ip6_tnl **tp = ip6_tnl_bucket(ip6n, &t->parms);
+ struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
- spin_lock_bh(&ip6_tnl_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next , rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ip6_tnl_lock);
}
/**
@@ -206,18 +231,25 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
static void
ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
{
- struct ip6_tnl **tp;
-
- for (tp = ip6_tnl_bucket(ip6n, &t->parms); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ip6_tnl_lock);
- *tp = t->next;
- spin_unlock_bh(&ip6_tnl_lock);
+ struct ip6_tnl __rcu **tp;
+ struct ip6_tnl *iter;
+
+ for (tp = ip6_tnl_bucket(ip6n, &t->parms);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
}
+static void ip6_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
/**
* ip6_tnl_create() - create a new tunnel
* @p: tunnel parameters
@@ -256,7 +288,9 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
t = netdev_priv(dev);
t->parms = *p;
- ip6_tnl_dev_init(dev);
+ err = ip6_tnl_dev_init(dev);
+ if (err < 0)
+ goto failed_free;
if ((err = register_netdevice(dev)) < 0)
goto failed_free;
@@ -266,7 +300,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
return t;
failed_free:
- free_netdev(dev);
+ ip6_dev_free(dev);
failed:
return NULL;
}
@@ -290,10 +324,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net,
{
struct in6_addr *remote = &p->raddr;
struct in6_addr *local = &p->laddr;
+ struct ip6_tnl __rcu **tp;
struct ip6_tnl *t;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
- for (t = *ip6_tnl_bucket(ip6n, p); t; t = t->next) {
+ for (tp = ip6_tnl_bucket(ip6n, p);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
if (ipv6_addr_equal(local, &t->parms.laddr) &&
ipv6_addr_equal(remote, &t->parms.raddr))
return t;
@@ -318,13 +355,10 @@ ip6_tnl_dev_uninit(struct net_device *dev)
struct net *net = dev_net(dev);
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
- if (dev == ip6n->fb_tnl_dev) {
- spin_lock_bh(&ip6_tnl_lock);
- ip6n->tnls_wc[0] = NULL;
- spin_unlock_bh(&ip6_tnl_lock);
- } else {
+ if (dev == ip6n->fb_tnl_dev)
+ rcu_assign_pointer(ip6n->tnls_wc[0], NULL);
+ else
ip6_tnl_unlink(ip6n, t);
- }
ip6_tnl_dst_reset(t);
dev_put(dev);
}
@@ -702,6 +736,8 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
&ipv6h->daddr)) != NULL) {
+ struct pcpu_tstats *tstats;
+
if (t->parms.proto != ipproto && t->parms.proto != 0) {
rcu_read_unlock();
goto discard;
@@ -724,10 +760,16 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
skb->pkt_type = PACKET_HOST;
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
- skb_tunnel_rx(skb, t->dev);
+ tstats = this_cpu_ptr(t->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, t->dev);
dscp_ecn_decapsulate(t, ipv6h, skb);
+
netif_rx(skb);
+
rcu_read_unlock();
return 0;
}
@@ -934,8 +976,10 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
err = ip6_local_out(skb);
if (net_xmit_eval(err) == 0) {
- stats->tx_bytes += pkt_len;
- stats->tx_packets++;
+ struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats);
+
+ tstats->tx_bytes += pkt_len;
+ tstats->tx_packets++;
} else {
stats->tx_errors++;
stats->tx_aborted_errors++;
@@ -1300,12 +1344,14 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
static const struct net_device_ops ip6_tnl_netdev_ops = {
- .ndo_uninit = ip6_tnl_dev_uninit,
+ .ndo_uninit = ip6_tnl_dev_uninit,
.ndo_start_xmit = ip6_tnl_xmit,
- .ndo_do_ioctl = ip6_tnl_ioctl,
+ .ndo_do_ioctl = ip6_tnl_ioctl,
.ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_stats = ip6_get_stats,
};
+
/**
* ip6_tnl_dev_setup - setup virtual tunnel device
* @dev: virtual device associated with tunnel
@@ -1317,7 +1363,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
static void ip6_tnl_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &ip6_tnl_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ip6_dev_free;
dev->type = ARPHRD_TUNNEL6;
dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
@@ -1333,12 +1379,17 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
* @dev: virtual device associated with tunnel
**/
-static inline void
+static inline int
ip6_tnl_dev_init_gen(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
+
t->dev = dev;
strcpy(t->parms.name, dev->name);
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+ return 0;
}
/**
@@ -1346,11 +1397,15 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
* @dev: virtual device associated with tunnel
**/
-static void ip6_tnl_dev_init(struct net_device *dev)
+static int ip6_tnl_dev_init(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- ip6_tnl_dev_init_gen(dev);
+ int err = ip6_tnl_dev_init_gen(dev);
+
+ if (err)
+ return err;
ip6_tnl_link_config(t);
+ return 0;
}
/**
@@ -1360,25 +1415,29 @@ static void ip6_tnl_dev_init(struct net_device *dev)
* Return: 0
**/
-static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
+static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
struct net *net = dev_net(dev);
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ int err = ip6_tnl_dev_init_gen(dev);
+
+ if (err)
+ return err;
- ip6_tnl_dev_init_gen(dev);
t->parms.proto = IPPROTO_IPV6;
dev_hold(dev);
- ip6n->tnls_wc[0] = t;
+ rcu_assign_pointer(ip6n->tnls_wc[0], t);
+ return 0;
}
-static struct xfrm6_tunnel ip4ip6_handler = {
+static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
.handler = ip4ip6_rcv,
.err_handler = ip4ip6_err,
.priority = 1,
};
-static struct xfrm6_tunnel ip6ip6_handler = {
+static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
.handler = ip6ip6_rcv,
.err_handler = ip6ip6_err,
.priority = 1,
@@ -1391,14 +1450,14 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
LIST_HEAD(list);
for (h = 0; h < HASH_SIZE; h++) {
- t = ip6n->tnls_r_l[h];
+ t = rtnl_dereference(ip6n->tnls_r_l[h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, &list);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
- t = ip6n->tnls_wc[0];
+ t = rtnl_dereference(ip6n->tnls_wc[0]);
unregister_netdevice_queue(t->dev, &list);
unregister_netdevice_many(&list);
}
@@ -1419,7 +1478,9 @@ static int __net_init ip6_tnl_init_net(struct net *net)
goto err_alloc_dev;
dev_net_set(ip6n->fb_tnl_dev, net);
- ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+ err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+ if (err < 0)
+ goto err_register;
err = register_netdev(ip6n->fb_tnl_dev);
if (err < 0)
@@ -1427,7 +1488,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
return 0;
err_register:
- free_netdev(ip6n->fb_tnl_dev);
+ ip6_dev_free(ip6n->fb_tnl_dev);
err_alloc_dev:
return err;
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 66078dad7fe..6f32ffce702 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -667,6 +667,7 @@ static int pim6_rcv(struct sk_buff *skb)
skb_tunnel_rx(skb, reg_dev);
netif_rx(skb);
+
dev_put(reg_dev);
return 0;
drop:
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a7f66bc8f0b..0553867a317 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -342,6 +342,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = 0;
break;
+ case IPV6_TRANSPARENT:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */
+ inet_sk(sk)->transparent = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RECVORIGDSTADDR:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->rxopt.bits.rxorigdstaddr = valbool;
+ retv = 0;
+ break;
+
case IPV6_HOPOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
@@ -1104,6 +1119,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
break;
}
+ case IPV6_TRANSPARENT:
+ val = inet_sk(sk)->transparent;
+ break;
+
+ case IPV6_RECVORIGDSTADDR:
+ val = np->rxopt.bits.rxorigdstaddr;
+ break;
+
case IPV6_UNICAST_HOPS:
case IPV6_MULTICAST_HOPS:
{
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 58841c4ae94..998d6d27e7c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -91,7 +91,9 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev);
+static u32 ndisc_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 rnd);
static int ndisc_constructor(struct neighbour *neigh);
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -228,12 +230,12 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
do {
cur = ((void *)cur) + (cur->nd_opt_len << 3);
} while(cur < end && cur->nd_opt_type != type);
- return (cur <= end && cur->nd_opt_type == type ? cur : NULL);
+ return cur <= end && cur->nd_opt_type == type ? cur : NULL;
}
static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
{
- return (opt->nd_opt_type == ND_OPT_RDNSS);
+ return opt->nd_opt_type == ND_OPT_RDNSS;
}
static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
@@ -244,7 +246,7 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
do {
cur = ((void *)cur) + (cur->nd_opt_len << 3);
} while(cur < end && !ndisc_is_useropt(cur));
- return (cur <= end && ndisc_is_useropt(cur) ? cur : NULL);
+ return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
}
static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
@@ -319,7 +321,7 @@ static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
int prepad = ndisc_addr_option_pad(dev->type);
if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad))
return NULL;
- return (lladdr + prepad);
+ return lladdr + prepad;
}
int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
@@ -350,7 +352,9 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
EXPORT_SYMBOL(ndisc_mc_map);
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
+static u32 ndisc_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 hash_rnd)
{
const u32 *p32 = pkey;
u32 addr_hash, i;
@@ -359,7 +363,7 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++)
addr_hash ^= *p32++;
- return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd);
+ return jhash_2words(addr_hash, dev->ifindex, hash_rnd);
}
static int ndisc_constructor(struct neighbour *neigh)
@@ -1105,6 +1109,18 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
}
+static inline int accept_ra(struct inet6_dev *in6_dev)
+{
+ /*
+ * If forwarding is enabled, RA are not accepted unless the special
+ * hybrid mode (accept_ra=2) is enabled.
+ */
+ if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2)
+ return 0;
+
+ return in6_dev->cnf.accept_ra;
+}
+
static void ndisc_router_discovery(struct sk_buff *skb)
{
struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
@@ -1158,8 +1174,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
return;
}
- /* skip route and link configuration on routers */
- if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra)
+ if (!accept_ra(in6_dev))
goto skip_linkparms;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
@@ -1309,8 +1324,7 @@ skip_linkparms:
NEIGH_UPDATE_F_ISROUTER);
}
- /* skip route and link configuration on routers */
- if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra)
+ if (!accept_ra(in6_dev))
goto out;
#ifdef CONFIG_IPV6_ROUTE_INFO
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 29d643bcafa..44d2eeac089 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -132,10 +132,10 @@ config IP6_NF_MATCH_RT
# The targets
config IP6_NF_TARGET_HL
tristate '"HL" hoplimit target support'
- depends on NETFILTER_ADVANCED
+ depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
select NETFILTER_XT_TARGET_HL
---help---
- This is a backwards-compat option for the user's convenience
+ This is a backwards-compatible option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_HL.
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index aafbba30c89..3f8e4a3d83c 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,10 +11,11 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
# objects for l3 independent conntrack
-nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o
+nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
+nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
# l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
+obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o
# matches
obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 5359ef4daac..51df035897e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -82,13 +82,13 @@ EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);
int
ip6t_ext_hdr(u8 nexthdr)
{
- return ( (nexthdr == IPPROTO_HOPOPTS) ||
- (nexthdr == IPPROTO_ROUTING) ||
- (nexthdr == IPPROTO_FRAGMENT) ||
- (nexthdr == IPPROTO_ESP) ||
- (nexthdr == IPPROTO_AH) ||
- (nexthdr == IPPROTO_NONE) ||
- (nexthdr == IPPROTO_DSTOPTS) );
+ return (nexthdr == IPPROTO_HOPOPTS) ||
+ (nexthdr == IPPROTO_ROUTING) ||
+ (nexthdr == IPPROTO_FRAGMENT) ||
+ (nexthdr == IPPROTO_ESP) ||
+ (nexthdr == IPPROTO_AH) ||
+ (nexthdr == IPPROTO_NONE) ||
+ (nexthdr == IPPROTO_DSTOPTS);
}
/* Returns whether matches rule or not. */
@@ -215,7 +215,7 @@ static inline bool unconditional(const struct ip6t_ip6 *ipv6)
return memcmp(ipv6, &uncond, sizeof(uncond)) == 0;
}
-static inline const struct ip6t_entry_target *
+static inline const struct xt_entry_target *
ip6t_get_target_c(const struct ip6t_entry *e)
{
return ip6t_get_target((struct ip6t_entry *)e);
@@ -260,9 +260,9 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
const char *hookname, const char **chainname,
const char **comment, unsigned int *rulenum)
{
- const struct ip6t_standard_target *t = (void *)ip6t_get_target_c(s);
+ const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
- if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) {
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
/* Head of user chain: ERROR target with chainname */
*chainname = t->target.data;
(*rulenum) = 0;
@@ -271,7 +271,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
if (s->target_offset == sizeof(struct ip6t_entry) &&
strcmp(t->target.u.kernel.target->name,
- IP6T_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
t->verdict < 0 &&
unconditional(&s->ipv6)) {
/* Tail of chains: STANDARD target (return/policy) */
@@ -369,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb,
e = get_entry(table_base, private->hook_entry[hook]);
do {
- const struct ip6t_entry_target *t;
+ const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
IP_NF_ASSERT(e);
@@ -403,10 +403,10 @@ ip6t_do_table(struct sk_buff *skb,
if (!t->u.kernel.target->target) {
int v;
- v = ((struct ip6t_standard_target *)t)->verdict;
+ v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
- if (v != IP6T_RETURN) {
+ if (v != XT_RETURN) {
verdict = (unsigned)(-v) - 1;
break;
}
@@ -434,7 +434,7 @@ ip6t_do_table(struct sk_buff *skb,
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
- if (verdict == IP6T_CONTINUE)
+ if (verdict == XT_CONTINUE)
e = ip6t_next_entry(e);
else
/* Verdict */
@@ -474,7 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- const struct ip6t_standard_target *t
+ const struct xt_standard_target *t
= (void *)ip6t_get_target_c(e);
int visited = e->comefrom & (1 << hook);
@@ -488,13 +488,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
/* Unconditional return/END. */
if ((e->target_offset == sizeof(struct ip6t_entry) &&
(strcmp(t->target.u.user.name,
- IP6T_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < 0 &&
unconditional(&e->ipv6)) || visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- IP6T_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -537,7 +537,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- IP6T_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
if (newpos > newinfo->size -
sizeof(struct ip6t_entry)) {
@@ -565,7 +565,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
return 1;
}
-static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
{
struct xt_mtdtor_param par;
@@ -581,14 +581,14 @@ static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
static int
check_entry(const struct ip6t_entry *e, const char *name)
{
- const struct ip6t_entry_target *t;
+ const struct xt_entry_target *t;
if (!ip6_checkentry(&e->ipv6)) {
duprintf("ip_tables: ip check failed %p %s.\n", e, name);
return -EINVAL;
}
- if (e->target_offset + sizeof(struct ip6t_entry_target) >
+ if (e->target_offset + sizeof(struct xt_entry_target) >
e->next_offset)
return -EINVAL;
@@ -599,7 +599,7 @@ check_entry(const struct ip6t_entry *e, const char *name)
return 0;
}
-static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
const struct ip6t_ip6 *ipv6 = par->entryinfo;
int ret;
@@ -618,7 +618,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
}
static int
-find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
struct xt_match *match;
int ret;
@@ -643,7 +643,7 @@ err:
static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
{
- struct ip6t_entry_target *t = ip6t_get_target(e);
+ struct xt_entry_target *t = ip6t_get_target(e);
struct xt_tgchk_param par = {
.net = net,
.table = name,
@@ -670,7 +670,7 @@ static int
find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
unsigned int size)
{
- struct ip6t_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
int ret;
unsigned int j;
@@ -721,7 +721,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
static bool check_underflow(const struct ip6t_entry *e)
{
- const struct ip6t_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int verdict;
if (!unconditional(&e->ipv6))
@@ -729,7 +729,7 @@ static bool check_underflow(const struct ip6t_entry *e)
t = ip6t_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
return false;
- verdict = ((struct ip6t_standard_target *)t)->verdict;
+ verdict = ((struct xt_standard_target *)t)->verdict;
verdict = -verdict - 1;
return verdict == NF_DROP || verdict == NF_ACCEPT;
}
@@ -752,7 +752,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
}
if (e->next_offset
- < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) {
+ < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
duprintf("checking: element %p size %u\n",
e, e->next_offset);
return -EINVAL;
@@ -784,7 +784,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
static void cleanup_entry(struct ip6t_entry *e, struct net *net)
{
struct xt_tgdtor_param par;
- struct ip6t_entry_target *t;
+ struct xt_entry_target *t;
struct xt_entry_match *ematch;
/* Cleanup all matches */
@@ -922,6 +922,7 @@ get_counters(const struct xt_table_info *t,
if (cpu == curcpu)
continue;
i = 0;
+ local_bh_disable();
xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -929,6 +930,7 @@ get_counters(const struct xt_table_info *t,
++i;
}
xt_info_wrunlock(cpu);
+ local_bh_enable();
}
put_cpu();
}
@@ -983,8 +985,8 @@ copy_entries_to_user(unsigned int total_size,
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
unsigned int i;
- const struct ip6t_entry_match *m;
- const struct ip6t_entry_target *t;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
e = (struct ip6t_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
@@ -1001,7 +1003,7 @@ copy_entries_to_user(unsigned int total_size,
m = (void *)e + i;
if (copy_to_user(userptr + off + i
- + offsetof(struct ip6t_entry_match,
+ + offsetof(struct xt_entry_match,
u.user.name),
m->u.kernel.match->name,
strlen(m->u.kernel.match->name)+1)
@@ -1013,7 +1015,7 @@ copy_entries_to_user(unsigned int total_size,
t = ip6t_get_target_c(e);
if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct ip6t_entry_target,
+ + offsetof(struct xt_entry_target,
u.user.name),
t->u.kernel.target->name,
strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1051,7 +1053,7 @@ static int compat_calc_entry(const struct ip6t_entry *e,
const void *base, struct xt_table_info *newinfo)
{
const struct xt_entry_match *ematch;
- const struct ip6t_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
@@ -1103,7 +1105,7 @@ static int compat_table_info(const struct xt_table_info *info,
static int get_info(struct net *net, void __user *user,
const int *len, int compat)
{
- char name[IP6T_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
int ret;
@@ -1116,7 +1118,7 @@ static int get_info(struct net *net, void __user *user,
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
- name[IP6T_TABLE_MAXNAMELEN-1] = '\0';
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_lock(AF_INET6);
@@ -1413,14 +1415,14 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
#ifdef CONFIG_COMPAT
struct compat_ip6t_replace {
- char name[IP6T_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
u32 num_entries;
u32 size;
u32 hook_entry[NF_INET_NUMHOOKS];
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
- compat_uptr_t counters; /* struct ip6t_counters * */
+ compat_uptr_t counters; /* struct xt_counters * */
struct compat_ip6t_entry entries[0];
};
@@ -1429,7 +1431,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
unsigned int *size, struct xt_counters *counters,
unsigned int i)
{
- struct ip6t_entry_target *t;
+ struct xt_entry_target *t;
struct compat_ip6t_entry __user *ce;
u_int16_t target_offset, next_offset;
compat_uint_t origsize;
@@ -1464,7 +1466,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
}
static int
-compat_find_calc_match(struct ip6t_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
const char *name,
const struct ip6t_ip6 *ipv6,
unsigned int hookmask,
@@ -1486,7 +1488,7 @@ compat_find_calc_match(struct ip6t_entry_match *m,
static void compat_release_entry(struct compat_ip6t_entry *e)
{
- struct ip6t_entry_target *t;
+ struct xt_entry_target *t;
struct xt_entry_match *ematch;
/* Cleanup all matches */
@@ -1507,7 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
const char *name)
{
struct xt_entry_match *ematch;
- struct ip6t_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
unsigned int entry_offset;
unsigned int j;
@@ -1589,7 +1591,7 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
unsigned int *size, const char *name,
struct xt_table_info *newinfo, unsigned char *base)
{
- struct ip6t_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
struct ip6t_entry *de;
unsigned int origsize;
@@ -1764,6 +1766,9 @@ translate_compat_table(struct net *net,
if (ret != 0)
break;
++i;
+ if (strcmp(ip6t_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
if (ret) {
/*
@@ -1894,7 +1899,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
}
struct compat_ip6t_get_entries {
- char name[IP6T_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
struct compat_ip6t_entry entrytable[0];
};
@@ -2049,7 +2054,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP6T_SO_GET_REVISION_MATCH:
case IP6T_SO_GET_REVISION_TARGET: {
- struct ip6t_get_revision rev;
+ struct xt_get_revision rev;
int target;
if (*len != sizeof(rev)) {
@@ -2186,7 +2191,7 @@ static int icmp6_checkentry(const struct xt_mtchk_param *par)
/* The built-in targets: standard (NULL) and error. */
static struct xt_target ip6t_builtin_tg[] __read_mostly = {
{
- .name = IP6T_STANDARD_TARGET,
+ .name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_IPV6,
#ifdef CONFIG_COMPAT
@@ -2196,9 +2201,9 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
#endif
},
{
- .name = IP6T_ERROR_TARGET,
+ .name = XT_ERROR_TARGET,
.target = ip6t_error,
- .targetsize = IP6T_FUNCTION_MAXNAMELEN,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
.family = NFPROTO_IPV6,
},
};
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0a07ae7b933..09c88891a75 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -23,6 +23,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog");
@@ -32,11 +33,9 @@ struct in_device;
#include <net/route.h>
#include <linux/netfilter_ipv6/ip6t_LOG.h>
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
/* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int ip6hoff,
int recurse)
{
@@ -55,15 +54,15 @@ static void dump_packet(const struct nf_loginfo *info,
ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
if (ih == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
- printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+ sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
- printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
(ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
ih->hop_limit,
@@ -78,35 +77,35 @@ static void dump_packet(const struct nf_loginfo *info,
hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
if (hp == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 48 "OPT (...) " */
if (logflags & IP6T_LOG_IPOPT)
- printk("OPT ( ");
+ sb_add(m, "OPT ( ");
switch (currenthdr) {
case IPPROTO_FRAGMENT: {
struct frag_hdr _fhdr;
const struct frag_hdr *fh;
- printk("FRAG:");
+ sb_add(m, "FRAG:");
fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
&_fhdr);
if (fh == NULL) {
- printk("TRUNCATED ");
+ sb_add(m, "TRUNCATED ");
return;
}
/* Max length: 6 "65535 " */
- printk("%u ", ntohs(fh->frag_off) & 0xFFF8);
+ sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
/* Max length: 11 "INCOMPLETE " */
if (fh->frag_off & htons(0x0001))
- printk("INCOMPLETE ");
+ sb_add(m, "INCOMPLETE ");
- printk("ID:%08x ", ntohl(fh->identification));
+ sb_add(m, "ID:%08x ", ntohl(fh->identification));
if (ntohs(fh->frag_off) & 0xFFF8)
fragment = 1;
@@ -120,7 +119,7 @@ static void dump_packet(const struct nf_loginfo *info,
case IPPROTO_HOPOPTS:
if (fragment) {
if (logflags & IP6T_LOG_IPOPT)
- printk(")");
+ sb_add(m, ")");
return;
}
hdrlen = ipv6_optlen(hp);
@@ -132,10 +131,10 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_auth_hdr *ah;
/* Max length: 3 "AH " */
- printk("AH ");
+ sb_add(m, "AH ");
if (fragment) {
- printk(")");
+ sb_add(m, ")");
return;
}
@@ -146,13 +145,13 @@ static void dump_packet(const struct nf_loginfo *info,
* Max length: 26 "INCOMPLETE [65535
* bytes] )"
*/
- printk("INCOMPLETE [%u bytes] )",
+ sb_add(m, "INCOMPLETE [%u bytes] )",
skb->len - ptr);
return;
}
/* Length: 15 "SPI=0xF1234567 */
- printk("SPI=0x%x ", ntohl(ah->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
}
@@ -164,10 +163,10 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_esp_hdr *eh;
/* Max length: 4 "ESP " */
- printk("ESP ");
+ sb_add(m, "ESP ");
if (fragment) {
- printk(")");
+ sb_add(m, ")");
return;
}
@@ -177,23 +176,23 @@ static void dump_packet(const struct nf_loginfo *info,
eh = skb_header_pointer(skb, ptr, sizeof(_esph),
&_esph);
if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] )",
+ sb_add(m, "INCOMPLETE [%u bytes] )",
skb->len - ptr);
return;
}
/* Length: 16 "SPI=0xF1234567 )" */
- printk("SPI=0x%x )", ntohl(eh->spi) );
+ sb_add(m, "SPI=0x%x )", ntohl(eh->spi) );
}
return;
default:
/* Max length: 20 "Unknown Ext Hdr 255" */
- printk("Unknown Ext Hdr %u", currenthdr);
+ sb_add(m, "Unknown Ext Hdr %u", currenthdr);
return;
}
if (logflags & IP6T_LOG_IPOPT)
- printk(") ");
+ sb_add(m, ") ");
currenthdr = hp->nexthdr;
ptr += hdrlen;
@@ -205,7 +204,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct tcphdr *th;
/* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
+ sb_add(m, "PROTO=TCP ");
if (fragment)
break;
@@ -213,40 +212,40 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
return;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
+ sb_add(m, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
if (logflags & IP6T_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
+ sb_add(m, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
+ sb_add(m, "WINDOW=%u ", ntohs(th->window));
/* Max length: 9 "RES=0x3C " */
- printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
if (th->cwr)
- printk("CWR ");
+ sb_add(m, "CWR ");
if (th->ece)
- printk("ECE ");
+ sb_add(m, "ECE ");
if (th->urg)
- printk("URG ");
+ sb_add(m, "URG ");
if (th->ack)
- printk("ACK ");
+ sb_add(m, "ACK ");
if (th->psh)
- printk("PSH ");
+ sb_add(m, "PSH ");
if (th->rst)
- printk("RST ");
+ sb_add(m, "RST ");
if (th->syn)
- printk("SYN ");
+ sb_add(m, "SYN ");
if (th->fin)
- printk("FIN ");
+ sb_add(m, "FIN ");
/* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
+ sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
if ((logflags & IP6T_LOG_TCPOPT) &&
th->doff * 4 > sizeof(struct tcphdr)) {
@@ -260,15 +259,15 @@ static void dump_packet(const struct nf_loginfo *info,
ptr + sizeof(struct tcphdr),
optsize, _opt);
if (op == NULL) {
- printk("OPT (TRUNCATED)");
+ sb_add(m, "OPT (TRUNCATED)");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i =0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
break;
}
@@ -279,9 +278,9 @@ static void dump_packet(const struct nf_loginfo *info,
if (currenthdr == IPPROTO_UDP)
/* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
+ sb_add(m, "PROTO=UDP " );
else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
+ sb_add(m, "PROTO=UDPLITE ");
if (fragment)
break;
@@ -289,12 +288,12 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
return;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
+ sb_add(m, "SPT=%u DPT=%u LEN=%u ",
ntohs(uh->source), ntohs(uh->dest),
ntohs(uh->len));
break;
@@ -304,7 +303,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct icmp6hdr *ic;
/* Max length: 13 "PROTO=ICMPv6 " */
- printk("PROTO=ICMPv6 ");
+ sb_add(m, "PROTO=ICMPv6 ");
if (fragment)
break;
@@ -312,18 +311,18 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
if (ic == NULL) {
- printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+ sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
return;
}
/* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+ sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
switch (ic->icmp6_type) {
case ICMPV6_ECHO_REQUEST:
case ICMPV6_ECHO_REPLY:
/* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
+ sb_add(m, "ID=%u SEQ=%u ",
ntohs(ic->icmp6_identifier),
ntohs(ic->icmp6_sequence));
break;
@@ -334,35 +333,35 @@ static void dump_packet(const struct nf_loginfo *info,
case ICMPV6_PARAMPROB:
/* Max length: 17 "POINTER=ffffffff " */
- printk("POINTER=%08x ", ntohl(ic->icmp6_pointer));
+ sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
/* Fall through */
case ICMPV6_DEST_UNREACH:
case ICMPV6_PKT_TOOBIG:
case ICMPV6_TIME_EXCEED:
/* Max length: 3+maxlen */
if (recurse) {
- printk("[");
- dump_packet(info, skb, ptr + sizeof(_icmp6h),
- 0);
- printk("] ");
+ sb_add(m, "[");
+ dump_packet(m, info, skb,
+ ptr + sizeof(_icmp6h), 0);
+ sb_add(m, "] ");
}
/* Max length: 10 "MTU=65535 " */
if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
- printk("MTU=%u ", ntohl(ic->icmp6_mtu));
+ sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
}
break;
}
/* Max length: 10 "PROTO=255 " */
default:
- printk("PROTO=%u ", currenthdr);
+ sb_add(m, "PROTO=%u ", currenthdr);
}
/* Max length: 15 "UID=4294967295 " */
if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u GID=%u ",
+ sb_add(m, "UID=%u GID=%u ",
skb->sk->sk_socket->file->f_cred->fsuid,
skb->sk->sk_socket->file->f_cred->fsgid);
read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -370,10 +369,11 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (!recurse && skb->mark)
- printk("MARK=0x%x ", skb->mark);
+ sb_add(m, "MARK=0x%x ", skb->mark);
}
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
@@ -387,7 +387,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
switch (dev->type) {
case ARPHRD_ETHER:
- printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
ntohs(eth_hdr(skb)->h_proto));
return;
@@ -396,7 +396,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
}
fallback:
- printk("MAC=");
+ sb_add(m, "MAC=");
if (dev->hard_header_len &&
skb->mac_header != skb->network_header) {
const unsigned char *p = skb_mac_header(skb);
@@ -408,19 +408,19 @@ fallback:
p = NULL;
if (p != NULL) {
- printk("%02x", *p++);
+ sb_add(m, "%02x", *p++);
for (i = 1; i < len; i++)
- printk(":%02x", p[i]);
+ sb_add(m, ":%02x", p[i]);
}
- printk(" ");
+ sb_add(m, " ");
if (dev->type == ARPHRD_SIT) {
const struct iphdr *iph =
(struct iphdr *)skb_mac_header(skb);
- printk("TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
+ sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
}
} else
- printk(" ");
+ sb_add(m, " ");
}
static struct nf_loginfo default_loginfo = {
@@ -442,22 +442,23 @@ ip6t_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
+ struct sbuff *m = sb_open();
+
if (!loginfo)
loginfo = &default_loginfo;
- spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
- prefix,
- in ? in->name : "",
- out ? out->name : "");
+ sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ prefix,
+ in ? in->name : "",
+ out ? out->name : "");
/* MAC logging for input path only. */
if (in && !out)
- dump_mac_header(loginfo, skb);
+ dump_mac_header(m, loginfo, skb);
+
+ dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
- dump_packet(loginfo, skb, skb_network_offset(skb), 1);
- printk("\n");
- spin_unlock_bh(&log_lock);
+ sb_close(m);
}
static unsigned int
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index ff43461704b..c8af58b2256 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -16,7 +16,6 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/icmp.h>
-#include <linux/sysctl.h>
#include <net/ipv6.h>
#include <net/inet_frag.h>
@@ -29,6 +28,7 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_log.h>
static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
@@ -189,53 +189,6 @@ out:
return nf_conntrack_confirm(skb);
}
-static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
- struct sk_buff *skb)
-{
- u16 zone = NF_CT_DEFAULT_ZONE;
-
- if (skb->nfct)
- zone = nf_ct_zone((struct nf_conn *)skb->nfct);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge &&
- skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
- return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
- if (hooknum == NF_INET_PRE_ROUTING)
- return IP6_DEFRAG_CONNTRACK_IN + zone;
- else
- return IP6_DEFRAG_CONNTRACK_OUT + zone;
-
-}
-
-static unsigned int ipv6_defrag(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- struct sk_buff *reasm;
-
- /* Previously seen (loopback)? */
- if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
- return NF_ACCEPT;
-
- reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
- /* queued */
- if (reasm == NULL)
- return NF_STOLEN;
-
- /* error occured or not fragmented */
- if (reasm == skb)
- return NF_ACCEPT;
-
- nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
- (struct net_device *)out, okfn);
-
- return NF_STOLEN;
-}
-
static unsigned int __ipv6_conntrack_in(struct net *net,
unsigned int hooknum,
struct sk_buff *skb,
@@ -288,13 +241,6 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
{
- .hook = ipv6_defrag,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
- },
- {
.hook = ipv6_conntrack_in,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
@@ -309,13 +255,6 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
.priority = NF_IP6_PRI_CONNTRACK,
},
{
- .hook = ipv6_defrag,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
- },
- {
.hook = ipv6_confirm,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV6,
@@ -387,10 +326,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
.nlattr_to_tuple = ipv6_nlattr_to_tuple,
.nla_policy = ipv6_nla_policy,
#endif
-#ifdef CONFIG_SYSCTL
- .ctl_table_path = nf_net_netfilter_sysctl_path,
- .ctl_table = nf_ct_ipv6_sysctl_table,
-#endif
.me = THIS_MODULE,
};
@@ -403,16 +338,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
int ret = 0;
need_conntrack();
+ nf_defrag_ipv6_enable();
- ret = nf_ct_frag6_init();
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't initialize frag6.\n");
- return ret;
- }
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
if (ret < 0) {
pr_err("nf_conntrack_ipv6: can't register tcp.\n");
- goto cleanup_frag6;
+ return ret;
}
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
@@ -450,8 +381,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
cleanup_tcp:
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- cleanup_frag6:
- nf_ct_frag6_cleanup();
return ret;
}
@@ -463,7 +392,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- nf_ct_frag6_cleanup();
}
module_init(nf_conntrack_l3proto_ipv6_init);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 13ef5bc05cf..489d71b844a 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -73,7 +73,7 @@ static struct inet_frags nf_frags;
static struct netns_frags nf_init_frags;
#ifdef CONFIG_SYSCTL
-struct ctl_table nf_ct_ipv6_sysctl_table[] = {
+struct ctl_table nf_ct_frag6_sysctl_table[] = {
{
.procname = "nf_conntrack_frag6_timeout",
.data = &nf_init_frags.timeout,
@@ -97,6 +97,8 @@ struct ctl_table nf_ct_ipv6_sysctl_table[] = {
},
{ }
};
+
+static struct ctl_table_header *nf_ct_frag6_sysctl_header;
#endif
static unsigned int nf_hashfn(struct inet_frag_queue *q)
@@ -113,14 +115,6 @@ static void nf_skb_free(struct sk_buff *skb)
kfree_skb(NFCT_FRAG6_CB(skb)->orig);
}
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct sk_buff *skb)
-{
- atomic_sub(skb->truesize, &nf_init_frags.mem);
- nf_skb_free(skb);
- kfree_skb(skb);
-}
-
/* Destruction primitives. */
static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
@@ -282,66 +276,22 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
}
found:
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
- */
- if (prev) {
- int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset;
-
- if (i > 0) {
- offset += i;
- if (end <= offset) {
- pr_debug("overlap\n");
- goto err;
- }
- if (!pskb_pull(skb, i)) {
- pr_debug("Can't pull\n");
- goto err;
- }
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
-
- /* Look for overlap with succeeding segments.
- * If we can merge fragments, do it.
+ /* RFC5722, Section 4:
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments, including those not yet received) MUST be silently
+ * discarded.
*/
- while (next && NFCT_FRAG6_CB(next)->offset < end) {
- /* overlap is 'i' bytes */
- int i = end - NFCT_FRAG6_CB(next)->offset;
-
- if (i < next->len) {
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- pr_debug("Eat head of the overlapped parts.: %d", i);
- if (!pskb_pull(next, i))
- goto err;
-
- /* next fragment */
- NFCT_FRAG6_CB(next)->offset += i;
- fq->q.meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragmnet is completely overridden with
- * new one drop it.
- */
- next = next->next;
- if (prev)
- prev->next = next;
- else
- fq->q.fragments = next;
+ /* Check for overlap with preceding fragment. */
+ if (prev &&
+ (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset > 0)
+ goto discard_fq;
- fq->q.meat -= free_it->len;
- frag_kfree_skb(free_it);
- }
- }
+ /* Look for overlap with succeeding segment. */
+ if (next && NFCT_FRAG6_CB(next)->offset < end)
+ goto discard_fq;
NFCT_FRAG6_CB(skb)->offset = offset;
@@ -371,6 +321,8 @@ found:
write_unlock(&nf_frags.lock);
return 0;
+discard_fq:
+ fq_kill(fq);
err:
return -1;
}
@@ -413,7 +365,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
- if (skb_has_frags(head)) {
+ if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = 0;
@@ -673,11 +625,21 @@ int nf_ct_frag6_init(void)
inet_frags_init_net(&nf_init_frags);
inet_frags_init(&nf_frags);
+ nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
+ nf_ct_frag6_sysctl_table);
+ if (!nf_ct_frag6_sysctl_header) {
+ inet_frags_fini(&nf_frags);
+ return -ENOMEM;
+ }
+
return 0;
}
void nf_ct_frag6_cleanup(void)
{
+ unregister_sysctl_table(nf_ct_frag6_sysctl_header);
+ nf_ct_frag6_sysctl_header = NULL;
+
inet_frags_fini(&nf_frags);
nf_init_frags.low_thresh = 0;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644
index 00000000000..99abfb53bab
--- /dev/null
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -0,0 +1,131 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ u16 zone = NF_CT_DEFAULT_ZONE;
+
+ if (skb->nfct)
+ zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (skb->nf_bridge &&
+ skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+ return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+ if (hooknum == NF_INET_PRE_ROUTING)
+ return IP6_DEFRAG_CONNTRACK_IN + zone;
+ else
+ return IP6_DEFRAG_CONNTRACK_OUT + zone;
+
+}
+
+static unsigned int ipv6_defrag(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *reasm;
+
+ /* Previously seen (loopback)? */
+ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+ return NF_ACCEPT;
+
+ reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
+ /* queued */
+ if (reasm == NULL)
+ return NF_STOLEN;
+
+ /* error occured or not fragmented */
+ if (reasm == skb)
+ return NF_ACCEPT;
+
+ nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
+ (struct net_device *)out, okfn);
+
+ return NF_STOLEN;
+}
+
+static struct nf_hook_ops ipv6_defrag_ops[] = {
+ {
+ .hook = ipv6_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+ },
+ {
+ .hook = ipv6_defrag,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+ },
+};
+
+static int __init nf_defrag_init(void)
+{
+ int ret = 0;
+
+ ret = nf_ct_frag6_init();
+ if (ret < 0) {
+ pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
+ return ret;
+ }
+ ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ if (ret < 0) {
+ pr_err("nf_defrag_ipv6: can't register hooks\n");
+ goto cleanup_frag6;
+ }
+ return ret;
+
+cleanup_frag6:
+ nf_ct_frag6_cleanup();
+ return ret;
+
+}
+
+static void __exit nf_defrag_fini(void)
+{
+ nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+ nf_ct_frag6_cleanup();
+}
+
+void nf_defrag_ipv6_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 1fa3468f0f3..9bb936ae245 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -25,28 +25,14 @@
#include <linux/spinlock.h>
#include <net/protocol.h>
-const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS];
-static DEFINE_SPINLOCK(inet6_proto_lock);
-
+const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS] __read_mostly;
int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
{
- int ret, hash = protocol & (MAX_INET_PROTOS - 1);
-
- spin_lock_bh(&inet6_proto_lock);
-
- if (inet6_protos[hash]) {
- ret = -1;
- } else {
- inet6_protos[hash] = prot;
- ret = 0;
- }
-
- spin_unlock_bh(&inet6_proto_lock);
+ int hash = protocol & (MAX_INET_PROTOS - 1);
- return ret;
+ return !cmpxchg(&inet6_protos[hash], NULL, prot) ? 0 : -1;
}
-
EXPORT_SYMBOL(inet6_add_protocol);
/*
@@ -57,20 +43,10 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol
{
int ret, hash = protocol & (MAX_INET_PROTOS - 1);
- spin_lock_bh(&inet6_proto_lock);
-
- if (inet6_protos[hash] != prot) {
- ret = -1;
- } else {
- inet6_protos[hash] = NULL;
- ret = 0;
- }
-
- spin_unlock_bh(&inet6_proto_lock);
+ ret = (cmpxchg(&inet6_protos[hash], prot, NULL) == prot) ? 0 : -1;
synchronize_net();
return ret;
}
-
EXPORT_SYMBOL(inet6_del_protocol);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e677937a07f..45e6efb7f17 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -764,7 +764,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
return -EINVAL;
if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
- return(-EAFNOSUPPORT);
+ return -EAFNOSUPPORT;
/* port is the proto value [0..255] carried in nexthdr */
proto = ntohs(sin6->sin6_port);
@@ -772,10 +772,10 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
if (!proto)
proto = inet->inet_num;
else if (proto != inet->inet_num)
- return(-EINVAL);
+ return -EINVAL;
if (proto > 255)
- return(-EINVAL);
+ return -EINVAL;
daddr = &sin6->sin6_addr;
if (np->sndflow) {
@@ -985,7 +985,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
/* You may get strange result with a positive odd offset;
RFC2292bis agrees with me. */
if (val > 0 && (val&1))
- return(-EINVAL);
+ return -EINVAL;
if (val < 0) {
rp->checksum = 0;
} else {
@@ -997,7 +997,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
break;
default:
- return(-ENOPROTOOPT);
+ return -ENOPROTOOPT;
}
}
@@ -1190,7 +1190,7 @@ static int rawv6_init_sk(struct sock *sk)
default:
break;
}
- return(0);
+ return 0;
}
struct proto rawv6_prot = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 545c4141b75..c7ba3149633 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -149,13 +149,6 @@ int ip6_frag_match(struct inet_frag_queue *q, void *a)
}
EXPORT_SYMBOL(ip6_frag_match);
-/* Memory Tracking Functions. */
-static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
-{
- atomic_sub(skb->truesize, &nf->mem);
- kfree_skb(skb);
-}
-
void ip6_frag_init(struct inet_frag_queue *q, void *a)
{
struct frag_queue *fq = container_of(q, struct frag_queue, q);
@@ -346,58 +339,22 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
}
found:
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
+ /* RFC5722, Section 4:
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments, including those not yet received) MUST be silently
+ * discarded.
*/
- if (prev) {
- int i = (FRAG6_CB(prev)->offset + prev->len) - offset;
- if (i > 0) {
- offset += i;
- if (end <= offset)
- goto err;
- if (!pskb_pull(skb, i))
- goto err;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
+ /* Check for overlap with preceding fragment. */
+ if (prev &&
+ (FRAG6_CB(prev)->offset + prev->len) - offset > 0)
+ goto discard_fq;
- /* Look for overlap with succeeding segments.
- * If we can merge fragments, do it.
- */
- while (next && FRAG6_CB(next)->offset < end) {
- int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */
-
- if (i < next->len) {
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- if (!pskb_pull(next, i))
- goto err;
- FRAG6_CB(next)->offset += i; /* next fragment */
- fq->q.meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragment is completely overridden with
- * new one drop it.
- */
- next = next->next;
-
- if (prev)
- prev->next = next;
- else
- fq->q.fragments = next;
-
- fq->q.meat -= free_it->len;
- frag_kfree_skb(fq->q.net, free_it);
- }
- }
+ /* Look for overlap with succeeding segment. */
+ if (next && FRAG6_CB(next)->offset < end)
+ goto discard_fq;
FRAG6_CB(skb)->offset = offset;
@@ -436,6 +393,8 @@ found:
write_unlock(&ip6_frags.lock);
return -1;
+discard_fq:
+ fq_kill(fq);
err:
IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_REASMFAILS);
@@ -499,7 +458,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
- if (skb_has_frags(head)) {
+ if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8f2d0400cf8..25661f968f3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -109,7 +109,6 @@ static struct dst_ops ip6_dst_ops_template = {
.link_failure = ip6_link_failure,
.update_pmtu = ip6_rt_update_pmtu,
.local_out = __ip6_local_out,
- .entries = ATOMIC_INIT(0),
};
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -122,7 +121,6 @@ static struct dst_ops ip6_dst_blackhole_ops = {
.destroy = ip6_dst_destroy,
.check = ip6_dst_check,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
- .entries = ATOMIC_INIT(0),
};
static struct rt6_info ip6_null_entry_template = {
@@ -217,14 +215,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
static __inline__ int rt6_check_expired(const struct rt6_info *rt)
{
- return (rt->rt6i_flags & RTF_EXPIRES &&
- time_after(jiffies, rt->rt6i_expires));
+ return (rt->rt6i_flags & RTF_EXPIRES) &&
+ time_after(jiffies, rt->rt6i_expires);
}
static inline int rt6_need_strict(struct in6_addr *daddr)
{
- return (ipv6_addr_type(daddr) &
- (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
+ return ipv6_addr_type(daddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}
/*
@@ -440,7 +438,7 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
__func__, match);
net = dev_net(rt0->rt6i_dev);
- return (match ? match : net->ipv6.ip6_null_entry);
+ return match ? match : net->ipv6.ip6_null_entry;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -670,7 +668,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad
if (net_ratelimit())
printk(KERN_WARNING
- "Neighbour table overflow.\n");
+ "ipv6: Neighbour table overflow.\n");
dst_free(&rt->dst);
return NULL;
}
@@ -859,7 +857,7 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl
dst_release(*dstp);
*dstp = new;
- return (new ? 0 : -ENOMEM);
+ return new ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
@@ -1058,19 +1056,22 @@ static int ip6_dst_gc(struct dst_ops *ops)
int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+ int entries;
+ entries = dst_entries_get_fast(ops);
if (time_after(rt_last_gc + rt_min_interval, now) &&
- atomic_read(&ops->entries) <= rt_max_size)
+ entries <= rt_max_size)
goto out;
net->ipv6.ip6_rt_gc_expire++;
fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
net->ipv6.ip6_rt_last_gc = now;
- if (atomic_read(&ops->entries) < ops->gc_thresh)
+ entries = dst_entries_get_slow(ops);
+ if (entries < ops->gc_thresh)
net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
- return (atomic_read(&ops->entries) > rt_max_size);
+ return entries > rt_max_size;
}
/* Clean host part of a prefix. Not necessary in radix tree,
@@ -1169,6 +1170,8 @@ int ip6_route_add(struct fib6_config *cfg)
if (addr_type & IPV6_ADDR_MULTICAST)
rt->dst.input = ip6_mc_input;
+ else if (cfg->fc_flags & RTF_LOCAL)
+ rt->dst.input = ip6_input;
else
rt->dst.input = ip6_forward;
@@ -1190,7 +1193,8 @@ int ip6_route_add(struct fib6_config *cfg)
they would result in kernel looping; promote them to reject routes
*/
if ((cfg->fc_flags & RTF_REJECT) ||
- (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
+ (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
+ && !(cfg->fc_flags&RTF_LOCAL))) {
/* hold loopback dev/idev if we haven't done so. */
if (dev != net->loopback_dev) {
if (dev) {
@@ -1556,14 +1560,13 @@ out:
* i.e. Path MTU discovery
*/
-void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
- struct net_device *dev, u32 pmtu)
+static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
+ struct net *net, u32 pmtu, int ifindex)
{
struct rt6_info *rt, *nrt;
- struct net *net = dev_net(dev);
int allfrag = 0;
- rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
+ rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
if (rt == NULL)
return;
@@ -1631,6 +1634,27 @@ out:
dst_release(&rt->dst);
}
+void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
+ struct net_device *dev, u32 pmtu)
+{
+ struct net *net = dev_net(dev);
+
+ /*
+ * RFC 1981 states that a node "MUST reduce the size of the packets it
+ * is sending along the path" that caused the Packet Too Big message.
+ * Since it's not possible in the general case to determine which
+ * interface was used to send the original packet, we update the MTU
+ * on the interface that will be used to send future packets. We also
+ * update the MTU on the interface that received the Packet Too Big in
+ * case the original packet was forced out that interface with
+ * SO_BINDTODEVICE or similar. This is the next best thing to the
+ * correct behaviour, which would be to update the MTU on all
+ * interfaces.
+ */
+ rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
+ rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
+}
+
/*
* Misc support functions
*/
@@ -2082,6 +2106,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rtm->rtm_type == RTN_UNREACHABLE)
cfg->fc_flags |= RTF_REJECT;
+ if (rtm->rtm_type == RTN_LOCAL)
+ cfg->fc_flags |= RTF_LOCAL;
+
cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
cfg->fc_nlinfo.nlh = nlh;
cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -2202,6 +2229,8 @@ static int rt6_fill_node(struct net *net,
NLA_PUT_U32(skb, RTA_TABLE, table);
if (rt->rt6i_flags&RTF_REJECT)
rtm->rtm_type = RTN_UNREACHABLE;
+ else if (rt->rt6i_flags&RTF_LOCAL)
+ rtm->rtm_type = RTN_LOCAL;
else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
rtm->rtm_type = RTN_LOCAL;
else
@@ -2496,7 +2525,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
net->ipv6.rt6_stats->fib_rt_alloc,
net->ipv6.rt6_stats->fib_rt_entries,
net->ipv6.rt6_stats->fib_rt_cache,
- atomic_read(&net->ipv6.ip6_dst_ops.entries),
+ dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
net->ipv6.rt6_stats->fib_discarded_routes);
return 0;
@@ -2580,7 +2609,7 @@ ctl_table ipv6_route_table_template[] = {
.data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec,
},
{
.procname = "mtu_expires",
@@ -2594,7 +2623,7 @@ ctl_table ipv6_route_table_template[] = {
.data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = proc_dointvec,
},
{
.procname = "gc_min_interval_ms",
@@ -2638,11 +2667,14 @@ static int __net_init ip6_route_net_init(struct net *net)
memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
sizeof(net->ipv6.ip6_dst_ops));
+ if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
+ goto out_ip6_dst_ops;
+
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
GFP_KERNEL);
if (!net->ipv6.ip6_null_entry)
- goto out_ip6_dst_ops;
+ goto out_ip6_dst_entries;
net->ipv6.ip6_null_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_null_entry;
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
@@ -2692,6 +2724,8 @@ out_ip6_prohibit_entry:
out_ip6_null_entry:
kfree(net->ipv6.ip6_null_entry);
#endif
+out_ip6_dst_entries:
+ dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
goto out;
}
@@ -2730,10 +2764,14 @@ int __init ip6_route_init(void)
if (!ip6_dst_ops_template.kmem_cachep)
goto out;
- ret = register_pernet_subsys(&ip6_route_net_ops);
+ ret = dst_entries_init(&ip6_dst_blackhole_ops);
if (ret)
goto out_kmem_cache;
+ ret = register_pernet_subsys(&ip6_route_net_ops);
+ if (ret)
+ goto out_dst_entries;
+
ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
/* Registering of the loopback is done before this portion of code,
@@ -2780,6 +2818,8 @@ out_fib6_init:
fib6_gc_cleanup();
out_register_subsys:
unregister_pernet_subsys(&ip6_route_net_ops);
+out_dst_entries:
+ dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
goto out;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 4699cd3c311..367a6cc584c 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -63,36 +63,63 @@
#define HASH_SIZE 16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-static void ipip6_tunnel_init(struct net_device *dev);
+static int ipip6_tunnel_init(struct net_device *dev);
static void ipip6_tunnel_setup(struct net_device *dev);
+static void ipip6_dev_free(struct net_device *dev);
static int sit_net_id __read_mostly;
struct sit_net {
- struct ip_tunnel *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel *tunnels_r[HASH_SIZE];
- struct ip_tunnel *tunnels_l[HASH_SIZE];
- struct ip_tunnel *tunnels_wc[1];
- struct ip_tunnel **tunnels[4];
+ struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_wc[1];
+ struct ip_tunnel __rcu **tunnels[4];
struct net_device *fb_tunnel_dev;
};
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ipip6_lock);
#define for_each_ip_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipip6_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
/*
* Must be invoked with rcu_read_lock
*/
static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
struct net_device *dev, __be32 remote, __be32 local)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(local);
struct ip_tunnel *t;
struct sit_net *sitn = net_generic(net, sit_net_id);
@@ -121,12 +148,12 @@ static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
return NULL;
}
-static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn,
+static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
struct ip_tunnel_parm *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- unsigned h = 0;
+ unsigned int h = 0;
int prio = 0;
if (remote) {
@@ -140,7 +167,7 @@ static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn,
return &sitn->tunnels[prio][h];
}
-static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn,
+static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn,
struct ip_tunnel *t)
{
return __ipip6_bucket(sitn, &t->parms);
@@ -148,13 +175,14 @@ static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn,
static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
-
- for (tp = ipip6_bucket(sitn, t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ipip6_lock);
- *tp = t->next;
- spin_unlock_bh(&ipip6_lock);
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipip6_bucket(sitn, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
@@ -162,12 +190,10 @@ static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipip6_bucket(sitn, t);
+ struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t);
- spin_lock_bh(&ipip6_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ipip6_lock);
}
static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
@@ -187,17 +213,20 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
#endif
}
-static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
struct ip_tunnel_parm *parms, int create)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- struct ip_tunnel *t, **tp, *nt;
+ struct ip_tunnel *t, *nt;
+ struct ip_tunnel __rcu **tp;
struct net_device *dev;
char name[IFNAMSIZ];
struct sit_net *sitn = net_generic(net, sit_net_id);
- for (tp = __ipip6_bucket(sitn, parms); (t = *tp) != NULL; tp = &t->next) {
+ for (tp = __ipip6_bucket(sitn, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
parms->link == t->parms.link) {
@@ -213,7 +242,7 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
if (parms->name[0])
strlcpy(name, parms->name, IFNAMSIZ);
else
- sprintf(name, "sit%%d");
+ strcpy(name, "sit%d");
dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup);
if (dev == NULL)
@@ -229,7 +258,8 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
nt = netdev_priv(dev);
nt->parms = *parms;
- ipip6_tunnel_init(dev);
+ if (ipip6_tunnel_init(dev) < 0)
+ goto failed_free;
ipip6_tunnel_clone_6rd(dev, sitn);
if (parms->i_flags & SIT_ISATAP)
@@ -244,7 +274,7 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
return nt;
failed_free:
- free_netdev(dev);
+ ipip6_dev_free(dev);
failed:
return NULL;
}
@@ -340,7 +370,7 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
ASSERT_RTNL();
- for (p = t->prl; p; p = p->next) {
+ for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) {
if (p->addr == a->addr) {
if (chg) {
p->flags = a->flags;
@@ -451,15 +481,12 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
struct sit_net *sitn = net_generic(net, sit_net_id);
if (dev == sitn->fb_tunnel_dev) {
- spin_lock_bh(&ipip6_lock);
- sitn->tunnels_wc[0] = NULL;
- spin_unlock_bh(&ipip6_lock);
- dev_put(dev);
+ rcu_assign_pointer(sitn->tunnels_wc[0], NULL);
} else {
ipip6_tunnel_unlink(sitn, netdev_priv(dev));
ipip6_tunnel_del_prl(netdev_priv(dev), NULL);
- dev_put(dev);
}
+ dev_put(dev);
}
@@ -548,6 +575,8 @@ static int ipip6_rcv(struct sk_buff *skb)
tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
iph->saddr, iph->daddr);
if (tunnel != NULL) {
+ struct pcpu_tstats *tstats;
+
secpath_reset(skb);
skb->mac_header = skb->network_header;
skb_reset_network_header(skb);
@@ -563,10 +592,16 @@ static int ipip6_rcv(struct sk_buff *skb)
return 0;
}
- skb_tunnel_rx(skb, tunnel->dev);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, tunnel->dev);
ipip6_ecn_decapsulate(iph, skb);
+
netif_rx(skb);
+
rcu_read_unlock();
return 0;
}
@@ -590,7 +625,7 @@ __be32 try_6rd(struct in6_addr *v6dst, struct ip_tunnel *tunnel)
#ifdef CONFIG_IPV6_SIT_6RD
if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
tunnel->ip6rd.prefixlen)) {
- unsigned pbw0, pbi0;
+ unsigned int pbw0, pbi0;
int pbi1;
u32 d;
@@ -625,14 +660,13 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct pcpu_tstats *tstats;
struct iphdr *tiph = &tunnel->parms.iph;
struct ipv6hdr *iph6 = ipv6_hdr(skb);
u8 tos = tunnel->parms.iph.tos;
__be16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst = tiph->daddr;
@@ -703,20 +737,20 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
.oif = tunnel->parms.link,
.proto = IPPROTO_IPV6 };
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
}
if (rt->rt_type != RTN_UNICAST) {
ip_rt_put(rt);
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
- stats->collisions++;
+ dev->stats.collisions++;
goto tx_error;
}
@@ -724,7 +758,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (mtu < 68) {
- stats->collisions++;
+ dev->stats.collisions++;
ip_rt_put(rt);
goto tx_error;
}
@@ -763,7 +797,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- txq->tx_dropped++;
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -799,14 +833,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
iph->ttl = iph6->hop_limit;
nf_reset(skb);
-
- IPTUNNEL_XMIT();
+ tstats = this_cpu_ptr(dev->tstats);
+ __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- stats->tx_errors++;
+ dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -1083,12 +1117,19 @@ static const struct net_device_ops ipip6_netdev_ops = {
.ndo_start_xmit = ipip6_tunnel_xmit,
.ndo_do_ioctl = ipip6_tunnel_ioctl,
.ndo_change_mtu = ipip6_tunnel_change_mtu,
+ .ndo_get_stats = ipip6_get_stats,
};
+static void ipip6_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ipip6_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip6_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipip6_dev_free;
dev->type = ARPHRD_SIT;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -1098,9 +1139,10 @@ static void ipip6_tunnel_setup(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_LLTX;
}
-static void ipip6_tunnel_init(struct net_device *dev)
+static int ipip6_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -1111,9 +1153,14 @@ static void ipip6_tunnel_init(struct net_device *dev)
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
ipip6_tunnel_bind_dev(dev);
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
}
-static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
+static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
@@ -1128,11 +1175,15 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
iph->ihl = 5;
iph->ttl = 64;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
dev_hold(dev);
sitn->tunnels_wc[0] = tunnel;
+ return 0;
}
-static struct xfrm_tunnel sit_handler = {
+static struct xfrm_tunnel sit_handler __read_mostly = {
.handler = ipip6_rcv,
.err_handler = ipip6_err,
.priority = 1,
@@ -1173,7 +1224,10 @@ static int __net_init sit_init_net(struct net *net)
}
dev_net_set(sitn->fb_tunnel_dev, net);
- ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+ err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+ if (err)
+ goto err_dev_free;
+
ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
if ((err = register_netdev(sitn->fb_tunnel_dev)))
@@ -1183,7 +1237,8 @@ static int __net_init sit_init_net(struct net *net)
err_reg_dev:
dev_put(sitn->fb_tunnel_dev);
- free_netdev(sitn->fb_tunnel_dev);
+err_dev_free:
+ ipip6_dev_free(sitn->fb_tunnel_dev);
err_alloc_dev:
return err;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fe6d40418c0..7e41e2cbb85 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -139,7 +139,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
return -EINVAL;
if (usin->sin6_family != AF_INET6)
- return(-EAFNOSUPPORT);
+ return -EAFNOSUPPORT;
memset(&fl, 0, sizeof(fl));
@@ -1409,7 +1409,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk = tcp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto out;
+ goto out_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1497,18 +1497,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
#endif
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ sock_put(newsk);
+ goto out;
+ }
__inet6_hash(newsk, NULL);
- __inet_inherit_port(sk, newsk);
return newsk;
out_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+out_nonewsk:
if (opt && opt != np->opt)
sock_kfree_s(sk, opt, opt->tot_len);
dst_release(dst);
+out:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
}
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index fc3c86a4745..d9864725d0c 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -30,8 +30,8 @@
#include <net/protocol.h>
#include <net/xfrm.h>
-static struct xfrm6_tunnel *tunnel6_handlers;
-static struct xfrm6_tunnel *tunnel46_handlers;
+static struct xfrm6_tunnel *tunnel6_handlers __read_mostly;
+static struct xfrm6_tunnel *tunnel46_handlers __read_mostly;
static DEFINE_MUTEX(tunnel6_mutex);
int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
@@ -51,7 +51,7 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
}
handler->next = *pprev;
- *pprev = handler;
+ rcu_assign_pointer(*pprev, handler);
ret = 0;
@@ -88,6 +88,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
EXPORT_SYMBOL(xfrm6_tunnel_deregister);
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
static int tunnel6_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
@@ -95,7 +100,7 @@ static int tunnel6_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- for (handler = tunnel6_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel6_handlers, handler)
if (!handler->handler(skb))
return 0;
@@ -113,7 +118,7 @@ static int tunnel46_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto drop;
- for (handler = tunnel46_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel46_handlers, handler)
if (!handler->handler(skb))
return 0;
@@ -129,7 +134,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
{
struct xfrm6_tunnel *handler;
- for (handler = tunnel6_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
break;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1dd1affdead..c84dad43211 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -111,10 +111,19 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
}
+static void udp_v6_rehash(struct sock *sk)
+{
+ u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+ &inet6_sk(sk)->rcv_saddr,
+ inet_sk(sk)->inet_num);
+
+ udp_lib_rehash(sk, new_hash);
+}
+
static inline int compute_score(struct sock *sk, struct net *net,
unsigned short hnum,
- struct in6_addr *saddr, __be16 sport,
- struct in6_addr *daddr, __be16 dport,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
int dif)
{
int score = -1;
@@ -230,8 +239,8 @@ exact_match:
}
static struct sock *__udp6_lib_lookup(struct net *net,
- struct in6_addr *saddr, __be16 sport,
- struct in6_addr *daddr, __be16 dport,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
int dif, struct udp_table *udptable)
{
struct sock *sk, *result;
@@ -311,6 +320,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
udptable);
}
+struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport, int dif)
+{
+ return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -1447,6 +1464,7 @@ struct proto udpv6_prot = {
.backlog_rcv = udpv6_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
+ .rehash = udp_v6_rehash,
.get_port = udp_v6_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6baeabbbca8..7e74023ea6e 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -199,7 +199,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
xfrm6_policy_afinfo.garbage_collect(net);
- return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+ return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
}
static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -255,7 +255,6 @@ static struct dst_ops xfrm6_dst_ops = {
.ifdown = xfrm6_dst_ifdown,
.local_out = __ip6_local_out,
.gc_thresh = 1024,
- .entries = ATOMIC_INIT(0),
};
static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
@@ -312,11 +311,13 @@ int __init xfrm6_init(void)
*/
gc_thresh = FIB6_TABLE_HASHSZ * 8;
xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh;
+ dst_entries_init(&xfrm6_dst_ops);
ret = xfrm6_policy_init();
- if (ret)
+ if (ret) {
+ dst_entries_destroy(&xfrm6_dst_ops);
goto out;
-
+ }
ret = xfrm6_state_init();
if (ret)
goto out_policy;
@@ -341,4 +342,5 @@ void xfrm6_fini(void)
//xfrm6_input_fini();
xfrm6_policy_fini();
xfrm6_state_fini();
+ dst_entries_destroy(&xfrm6_dst_ops);
}
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index f417b77fa0e..a67575d472a 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -20,23 +20,27 @@
#include <net/addrconf.h>
static void
-__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
- struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm6_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
{
/* Initialize temporary selector matching only
* to current session. */
- ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst);
- ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src);
- x->sel.dport = xfrm_flowi_dport(fl);
- x->sel.dport_mask = htons(0xffff);
- x->sel.sport = xfrm_flowi_sport(fl);
- x->sel.sport_mask = htons(0xffff);
- x->sel.family = AF_INET6;
- x->sel.prefixlen_d = 128;
- x->sel.prefixlen_s = 128;
- x->sel.proto = fl->proto;
- x->sel.ifindex = fl->oif;
+ ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl->fl6_dst);
+ ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl->fl6_src);
+ sel->dport = xfrm_flowi_dport(fl);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET6;
+ sel->prefixlen_d = 128;
+ sel->prefixlen_s = 128;
+ sel->proto = fl->proto;
+ sel->ifindex = fl->oif;
+}
+
+static void
+xfrm6_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+ xfrm_address_t *daddr, xfrm_address_t *saddr)
+{
x->id = tmpl->id;
if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
@@ -168,6 +172,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.eth_proto = htons(ETH_P_IPV6),
.owner = THIS_MODULE,
.init_tempsel = __xfrm6_init_tempsel,
+ .init_temprop = xfrm6_init_temprop,
.tmpl_sort = __xfrm6_tmpl_sort,
.state_sort = __xfrm6_state_sort,
.output = xfrm6_output,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 2ce3a8278f2..2969cad408d 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -199,7 +199,7 @@ static void x6spi_destroy_rcu(struct rcu_head *head)
container_of(head, struct xfrm6_tunnel_spi, rcu_head));
}
-void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
+static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
{
struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
struct xfrm6_tunnel_spi *x6spi;
@@ -223,8 +223,6 @@ void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
spin_unlock_bh(&xfrm6_tunnel_spi_lock);
}
-EXPORT_SYMBOL(xfrm6_tunnel_free_spi);
-
static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
{
skb_push(skb, -skb_network_offset(skb));
@@ -317,13 +315,13 @@ static const struct xfrm_type xfrm6_tunnel_type = {
.output = xfrm6_tunnel_output,
};
-static struct xfrm6_tunnel xfrm6_tunnel_handler = {
+static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
.priority = 2,
};
-static struct xfrm6_tunnel xfrm46_tunnel_handler = {
+static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
.priority = 2,
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index e9ad0062fbb..02549cb2c32 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -3,6 +3,7 @@
#
config IPX
tristate "The IPX protocol"
+ depends on BKL # should be fixable
select LLC
---help---
This is support for the Novell networking protocol, IPX, commonly
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 79986a674f6..7f097989cde 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -573,9 +573,9 @@ static int irda_find_lsap_sel(struct irda_sock *self, char *name)
/* Requested object/attribute doesn't exist */
if((self->errno == IAS_CLASS_UNKNOWN) ||
(self->errno == IAS_ATTRIB_UNKNOWN))
- return (-EADDRNOTAVAIL);
+ return -EADDRNOTAVAIL;
else
- return (-EHOSTUNREACH);
+ return -EHOSTUNREACH;
}
/* Get the remote TSAP selector */
@@ -663,7 +663,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
__func__, name);
self->daddr = DEV_ADDR_ANY;
kfree(discoveries);
- return(-ENOTUNIQ);
+ return -ENOTUNIQ;
}
/* First time we found that one, save it ! */
daddr = self->daddr;
@@ -677,7 +677,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__);
self->daddr = DEV_ADDR_ANY;
kfree(discoveries);
- return(-EHOSTUNREACH);
+ return -EHOSTUNREACH;
break;
}
}
@@ -689,7 +689,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n",
__func__, name);
self->daddr = DEV_ADDR_ANY;
- return(-EADDRNOTAVAIL);
+ return -EADDRNOTAVAIL;
}
/* Revert back to discovered device & service */
@@ -715,14 +715,11 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
struct sockaddr_irda saddr;
struct sock *sk = sock->sk;
struct irda_sock *self = irda_sk(sk);
- int err;
- lock_kernel();
memset(&saddr, 0, sizeof(saddr));
if (peer) {
- err = -ENOTCONN;
if (sk->sk_state != TCP_ESTABLISHED)
- goto out;
+ return -ENOTCONN;
saddr.sir_family = AF_IRDA;
saddr.sir_lsap_sel = self->dtsap_sel;
@@ -739,10 +736,8 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
/* uaddr_len come to us uninitialised */
*uaddr_len = sizeof (struct sockaddr_irda);
memcpy(uaddr, &saddr, *uaddr_len);
- err = 0;
-out:
- unlock_kernel();
- return err;
+
+ return 0;
}
/*
@@ -758,7 +753,8 @@ static int irda_listen(struct socket *sock, int backlog)
IRDA_DEBUG(2, "%s()\n", __func__);
- lock_kernel();
+ lock_sock(sk);
+
if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
(sk->sk_type != SOCK_DGRAM))
goto out;
@@ -770,7 +766,7 @@ static int irda_listen(struct socket *sock, int backlog)
err = 0;
}
out:
- unlock_kernel();
+ release_sock(sk);
return err;
}
@@ -793,7 +789,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_len != sizeof(struct sockaddr_irda))
return -EINVAL;
- lock_kernel();
+ lock_sock(sk);
#ifdef CONFIG_IRDA_ULTRA
/* Special care for Ultra sockets */
if ((sk->sk_type == SOCK_DGRAM) &&
@@ -824,8 +820,8 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name);
if (err < 0) {
- kfree(self->ias_obj->name);
- kfree(self->ias_obj);
+ irias_delete_object(self->ias_obj);
+ self->ias_obj = NULL;
goto out;
}
@@ -836,7 +832,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
err = 0;
out:
- unlock_kernel();
+ release_sock(sk);
return err;
}
@@ -856,12 +852,13 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
IRDA_DEBUG(2, "%s()\n", __func__);
- lock_kernel();
err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
if (err)
- goto out;
+ return err;
err = -EINVAL;
+
+ lock_sock(sk);
if (sock->state != SS_UNCONNECTED)
goto out;
@@ -947,7 +944,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
irda_connect_response(new);
err = 0;
out:
- unlock_kernel();
+ release_sock(sk);
return err;
}
@@ -981,7 +978,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
- lock_kernel();
+ lock_sock(sk);
/* Don't allow connect for Ultra sockets */
err = -ESOCKTNOSUPPORT;
if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA))
@@ -1072,6 +1069,8 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
if (sk->sk_state != TCP_ESTABLISHED) {
sock->state = SS_UNCONNECTED;
+ if (sk->sk_prot->disconnect(sk, flags))
+ sock->state = SS_DISCONNECTING;
err = sock_error(sk);
if (!err)
err = -ECONNRESET;
@@ -1084,7 +1083,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
self->saddr = irttp_get_saddr(self->tsap);
err = 0;
out:
- unlock_kernel();
+ release_sock(sk);
return err;
}
@@ -1231,7 +1230,6 @@ static int irda_release(struct socket *sock)
if (sk == NULL)
return 0;
- lock_kernel();
lock_sock(sk);
sk->sk_state = TCP_CLOSE;
sk->sk_shutdown |= SEND_SHUTDOWN;
@@ -1250,7 +1248,6 @@ static int irda_release(struct socket *sock)
/* Destroy networking socket if we are the last reference on it,
* i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */
sock_put(sk);
- unlock_kernel();
/* Notes on socket locking and deallocation... - Jean II
* In theory we should put pairs of sock_hold() / sock_put() to
@@ -1298,7 +1295,6 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
- lock_kernel();
/* Note : socket.c set MSG_EOR on SEQPACKET sockets */
if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT |
MSG_NOSIGNAL)) {
@@ -1306,6 +1302,8 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
goto out;
}
+ lock_sock(sk);
+
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto out_err;
@@ -1361,14 +1359,14 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
goto out_err;
}
- unlock_kernel();
+ release_sock(sk);
/* Tell client how much data we actually sent */
return len;
out_err:
err = sk_stream_error(sk, msg->msg_flags, err);
out:
- unlock_kernel();
+ release_sock(sk);
return err;
}
@@ -1390,14 +1388,10 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
IRDA_DEBUG(4, "%s()\n", __func__);
- lock_kernel();
- if ((err = sock_error(sk)) < 0)
- goto out;
-
skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
flags & MSG_DONTWAIT, &err);
if (!skb)
- goto out;
+ return err;
skb_reset_transport_header(skb);
copied = skb->len;
@@ -1425,12 +1419,8 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
irttp_flow_request(self->tsap, FLOW_START);
}
}
- unlock_kernel();
- return copied;
-out:
- unlock_kernel();
- return err;
+ return copied;
}
/*
@@ -1448,17 +1438,15 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
IRDA_DEBUG(3, "%s()\n", __func__);
- lock_kernel();
if ((err = sock_error(sk)) < 0)
- goto out;
+ return err;
- err = -EINVAL;
if (sock->flags & __SO_ACCEPTCON)
- goto out;
+ return -EINVAL;
err =-EOPNOTSUPP;
if (flags & MSG_OOB)
- goto out;
+ return -EOPNOTSUPP;
err = 0;
target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
@@ -1500,7 +1488,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
finish_wait(sk_sleep(sk), &wait);
if (err)
- goto out;
+ return err;
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
@@ -1553,9 +1541,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
}
}
-out:
- unlock_kernel();
- return err ? : copied;
+ return copied;
}
/*
@@ -1573,13 +1559,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
struct sk_buff *skb;
int err;
- lock_kernel();
-
IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
- err = -EINVAL;
if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
- goto out;
+ return -EINVAL;
+
+ lock_sock(sk);
if (sk->sk_shutdown & SEND_SHUTDOWN) {
send_sig(SIGPIPE, current, 0);
@@ -1630,10 +1615,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
goto out;
}
- unlock_kernel();
+
+ release_sock(sk);
return len;
+
out:
- unlock_kernel();
+ release_sock(sk);
return err;
}
@@ -1656,10 +1643,11 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
- lock_kernel();
err = -EINVAL;
if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
- goto out;
+ return -EINVAL;
+
+ lock_sock(sk);
err = -EPIPE;
if (sk->sk_shutdown & SEND_SHUTDOWN) {
@@ -1732,7 +1720,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
if (err)
IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
out:
- unlock_kernel();
+ release_sock(sk);
return err ? : len;
}
#endif /* CONFIG_IRDA_ULTRA */
@@ -1747,7 +1735,7 @@ static int irda_shutdown(struct socket *sock, int how)
IRDA_DEBUG(1, "%s(%p)\n", __func__, self);
- lock_kernel();
+ lock_sock(sk);
sk->sk_state = TCP_CLOSE;
sk->sk_shutdown |= SEND_SHUTDOWN;
@@ -1769,7 +1757,7 @@ static int irda_shutdown(struct socket *sock, int how)
self->daddr = DEV_ADDR_ANY; /* Until we get re-connected */
self->saddr = 0x0; /* so IrLMP assign us any link */
- unlock_kernel();
+ release_sock(sk);
return 0;
}
@@ -1786,7 +1774,6 @@ static unsigned int irda_poll(struct file * file, struct socket *sock,
IRDA_DEBUG(4, "%s()\n", __func__);
- lock_kernel();
poll_wait(file, sk_sleep(sk), wait);
mask = 0;
@@ -1834,20 +1821,8 @@ static unsigned int irda_poll(struct file * file, struct socket *sock,
default:
break;
}
- unlock_kernel();
- return mask;
-}
-
-static unsigned int irda_datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
-{
- int err;
-
- lock_kernel();
- err = datagram_poll(file, sock, wait);
- unlock_kernel();
- return err;
+ return mask;
}
/*
@@ -1860,7 +1835,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
IRDA_DEBUG(4, "%s(), cmd=%#x\n", __func__, cmd);
- lock_kernel();
err = -EINVAL;
switch (cmd) {
case TIOCOUTQ: {
@@ -1903,7 +1877,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __func__);
err = -ENOIOCTLCMD;
}
- unlock_kernel();
return err;
}
@@ -1927,7 +1900,7 @@ static int irda_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
* Set some options for the socket
*
*/
-static int __irda_setsockopt(struct socket *sock, int level, int optname,
+static int irda_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
@@ -1935,13 +1908,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
struct irda_ias_set *ias_opt;
struct ias_object *ias_obj;
struct ias_attrib * ias_attr; /* Attribute in IAS object */
- int opt, free_ias = 0;
+ int opt, free_ias = 0, err = 0;
IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
if (level != SOL_IRLMP)
return -ENOPROTOOPT;
+ lock_sock(sk);
+
switch (optname) {
case IRLMP_IAS_SET:
/* The user want to add an attribute to an existing IAS object
@@ -1951,17 +1926,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
* create the right attribute...
*/
- if (optlen != sizeof(struct irda_ias_set))
- return -EINVAL;
+ if (optlen != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
- if (ias_opt == NULL)
- return -ENOMEM;
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
/* Copy query to the driver. */
if (copy_from_user(ias_opt, optval, optlen)) {
kfree(ias_opt);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
/* Find the object we target.
@@ -1971,7 +1951,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
if(ias_opt->irda_class_name[0] == '\0') {
if(self->ias_obj == NULL) {
kfree(ias_opt);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
ias_obj = self->ias_obj;
} else
@@ -1983,7 +1964,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
if((!capable(CAP_NET_ADMIN)) &&
((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
kfree(ias_opt);
- return -EPERM;
+ err = -EPERM;
+ goto out;
}
/* If the object doesn't exist, create it */
@@ -1993,7 +1975,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
jiffies);
if (ias_obj == NULL) {
kfree(ias_opt);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto out;
}
free_ias = 1;
}
@@ -2005,7 +1988,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
kfree(ias_obj->name);
kfree(ias_obj);
}
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
/* Look at the type */
@@ -2028,7 +2012,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
kfree(ias_obj);
}
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
/* Add an octet sequence attribute */
irias_add_octseq_attrib(
@@ -2060,7 +2045,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
kfree(ias_obj->name);
kfree(ias_obj);
}
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
irias_insert_object(ias_obj);
kfree(ias_opt);
@@ -2071,17 +2057,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
* object is not owned by the kernel and delete it.
*/
- if (optlen != sizeof(struct irda_ias_set))
- return -EINVAL;
+ if (optlen != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
- if (ias_opt == NULL)
- return -ENOMEM;
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
/* Copy query to the driver. */
if (copy_from_user(ias_opt, optval, optlen)) {
kfree(ias_opt);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
/* Find the object we target.
@@ -2094,7 +2085,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
ias_obj = irias_find_object(ias_opt->irda_class_name);
if(ias_obj == (struct ias_object *) NULL) {
kfree(ias_opt);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
/* Only ROOT can mess with the global IAS database.
@@ -2103,7 +2095,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
if((!capable(CAP_NET_ADMIN)) &&
((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
kfree(ias_opt);
- return -EPERM;
+ err = -EPERM;
+ goto out;
}
/* Find the attribute (in the object) we target */
@@ -2111,14 +2104,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
ias_opt->irda_attrib_name);
if(ias_attr == (struct ias_attrib *) NULL) {
kfree(ias_opt);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
/* Check is the user space own the object */
if(ias_attr->value->owner != IAS_USER_ATTR) {
IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __func__);
kfree(ias_opt);
- return -EPERM;
+ err = -EPERM;
+ goto out;
}
/* Remove the attribute (and maybe the object) */
@@ -2126,11 +2121,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
kfree(ias_opt);
break;
case IRLMP_MAX_SDU_SIZE:
- if (optlen < sizeof(int))
- return -EINVAL;
+ if (optlen < sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
- if (get_user(opt, (int __user *)optval))
- return -EFAULT;
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
/* Only possible for a seqpacket service (TTP with SAR) */
if (sk->sk_type != SOCK_SEQPACKET) {
@@ -2140,16 +2139,21 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
} else {
IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n",
__func__);
- return -ENOPROTOOPT;
+ err = -ENOPROTOOPT;
+ goto out;
}
break;
case IRLMP_HINTS_SET:
- if (optlen < sizeof(int))
- return -EINVAL;
+ if (optlen < sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
/* The input is really a (__u8 hints[2]), easier as an int */
- if (get_user(opt, (int __user *)optval))
- return -EFAULT;
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
/* Unregister any old registration */
if (self->skey)
@@ -2163,12 +2167,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
* making a discovery (nodes which don't match any hint
* bit in the mask are not reported).
*/
- if (optlen < sizeof(int))
- return -EINVAL;
+ if (optlen < sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
/* The input is really a (__u8 hints[2]), easier as an int */
- if (get_user(opt, (int __user *)optval))
- return -EFAULT;
+ if (get_user(opt, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
/* Set the new hint mask */
self->mask.word = (__u16) opt;
@@ -2180,19 +2188,12 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
break;
default:
- return -ENOPROTOOPT;
+ err = -ENOPROTOOPT;
+ break;
}
- return 0;
-}
-
-static int irda_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- int err;
- lock_kernel();
- err = __irda_setsockopt(sock, level, optname, optval, optlen);
- unlock_kernel();
+out:
+ release_sock(sk);
return err;
}
@@ -2249,7 +2250,7 @@ static int irda_extract_ias_value(struct irda_ias_set *ias_opt,
/*
* Function irda_getsockopt (sock, level, optname, optval, optlen)
*/
-static int __irda_getsockopt(struct socket *sock, int level, int optname,
+static int irda_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
@@ -2262,7 +2263,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
int daddr = DEV_ADDR_ANY; /* Dest address for IAS queries */
int val = 0;
int len = 0;
- int err;
+ int err = 0;
int offset, total;
IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
@@ -2276,15 +2277,18 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
if(len < 0)
return -EINVAL;
+ lock_sock(sk);
+
switch (optname) {
case IRLMP_ENUMDEVICES:
/* Ask lmp for the current discovery log */
discoveries = irlmp_get_discoveries(&list.len, self->mask.word,
self->nslots);
/* Check if the we got some results */
- if (discoveries == NULL)
- return -EAGAIN; /* Didn't find any devices */
- err = 0;
+ if (discoveries == NULL) {
+ err = -EAGAIN;
+ goto out; /* Didn't find any devices */
+ }
/* Write total list length back to client */
if (copy_to_user(optval, &list,
@@ -2297,8 +2301,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
sizeof(struct irda_device_info);
/* Copy the list itself - watch for overflow */
- if(list.len > 2048)
- {
+ if (list.len > 2048) {
err = -EINVAL;
goto bed;
}
@@ -2314,17 +2317,20 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
bed:
/* Free up our buffer */
kfree(discoveries);
- if (err)
- return err;
break;
case IRLMP_MAX_SDU_SIZE:
val = self->max_data_size;
len = sizeof(int);
- if (put_user(len, optlen))
- return -EFAULT;
+ if (put_user(len, optlen)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, &val, len)) {
+ err = -EFAULT;
+ goto out;
+ }
- if (copy_to_user(optval, &val, len))
- return -EFAULT;
break;
case IRLMP_IAS_GET:
/* The user want an object from our local IAS database.
@@ -2332,17 +2338,22 @@ bed:
* that we found */
/* Check that the user has allocated the right space for us */
- if (len != sizeof(struct irda_ias_set))
- return -EINVAL;
+ if (len != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
- if (ias_opt == NULL)
- return -ENOMEM;
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
/* Copy query to the driver. */
if (copy_from_user(ias_opt, optval, len)) {
kfree(ias_opt);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
/* Find the object we target.
@@ -2355,7 +2366,8 @@ bed:
ias_obj = irias_find_object(ias_opt->irda_class_name);
if(ias_obj == (struct ias_object *) NULL) {
kfree(ias_opt);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
/* Find the attribute (in the object) we target */
@@ -2363,21 +2375,23 @@ bed:
ias_opt->irda_attrib_name);
if(ias_attr == (struct ias_attrib *) NULL) {
kfree(ias_opt);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
/* Translate from internal to user structure */
err = irda_extract_ias_value(ias_opt, ias_attr->value);
if(err) {
kfree(ias_opt);
- return err;
+ goto out;
}
/* Copy reply to the user */
if (copy_to_user(optval, ias_opt,
sizeof(struct irda_ias_set))) {
kfree(ias_opt);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
/* Note : don't need to put optlen, we checked it */
kfree(ias_opt);
@@ -2388,17 +2402,22 @@ bed:
* then wait for the answer to come back. */
/* Check that the user has allocated the right space for us */
- if (len != sizeof(struct irda_ias_set))
- return -EINVAL;
+ if (len != sizeof(struct irda_ias_set)) {
+ err = -EINVAL;
+ goto out;
+ }
ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
- if (ias_opt == NULL)
- return -ENOMEM;
+ if (ias_opt == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
/* Copy query to the driver. */
if (copy_from_user(ias_opt, optval, len)) {
kfree(ias_opt);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
/* At this point, there are two cases...
@@ -2419,7 +2438,8 @@ bed:
daddr = ias_opt->daddr;
if((!daddr) || (daddr == DEV_ADDR_ANY)) {
kfree(ias_opt);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
}
@@ -2428,7 +2448,8 @@ bed:
IRDA_WARNING("%s: busy with a previous query\n",
__func__);
kfree(ias_opt);
- return -EBUSY;
+ err = -EBUSY;
+ goto out;
}
self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
@@ -2436,7 +2457,8 @@ bed:
if (self->iriap == NULL) {
kfree(ias_opt);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto out;
}
/* Treat unexpected wakeup as disconnect */
@@ -2455,7 +2477,8 @@ bed:
* we can free it regardless! */
kfree(ias_opt);
/* Treat signals as disconnect */
- return -EHOSTUNREACH;
+ err = -EHOSTUNREACH;
+ goto out;
}
/* Check what happened */
@@ -2465,9 +2488,11 @@ bed:
/* Requested object/attribute doesn't exist */
if((self->errno == IAS_CLASS_UNKNOWN) ||
(self->errno == IAS_ATTRIB_UNKNOWN))
- return (-EADDRNOTAVAIL);
+ err = -EADDRNOTAVAIL;
else
- return (-EHOSTUNREACH);
+ err = -EHOSTUNREACH;
+
+ goto out;
}
/* Translate from internal to user structure */
@@ -2476,14 +2501,15 @@ bed:
irias_delete_value(self->ias_result);
if (err) {
kfree(ias_opt);
- return err;
+ goto out;
}
/* Copy reply to the user */
if (copy_to_user(optval, ias_opt,
sizeof(struct irda_ias_set))) {
kfree(ias_opt);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
/* Note : don't need to put optlen, we checked it */
kfree(ias_opt);
@@ -2504,11 +2530,15 @@ bed:
*/
/* Check that the user is passing us an int */
- if (len != sizeof(int))
- return -EINVAL;
+ if (len != sizeof(int)) {
+ err = -EINVAL;
+ goto out;
+ }
/* Get timeout in ms (max time we block the caller) */
- if (get_user(val, (int __user *)optval))
- return -EFAULT;
+ if (get_user(val, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
/* Tell IrLMP we want to be notified */
irlmp_update_client(self->ckey, self->mask.word,
@@ -2520,8 +2550,6 @@ bed:
/* Wait until a node is discovered */
if (!self->cachedaddr) {
- int ret = 0;
-
IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __func__);
/* Set watchdog timer to expire in <val> ms. */
@@ -2534,7 +2562,7 @@ bed:
/* Wait for IR-LMP to call us back */
__wait_event_interruptible(self->query_wait,
(self->cachedaddr != 0 || self->errno == -ETIME),
- ret);
+ err);
/* If watchdog is still activated, kill it! */
if(timer_pending(&(self->watchdog)))
@@ -2542,8 +2570,8 @@ bed:
IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__);
- if (ret != 0)
- return ret;
+ if (err != 0)
+ goto out;
}
else
IRDA_DEBUG(1, "%s(), found immediately !\n",
@@ -2566,25 +2594,19 @@ bed:
* If the user want more details, he should query
* the whole discovery log and pick one device...
*/
- if (put_user(daddr, (int __user *)optval))
- return -EFAULT;
+ if (put_user(daddr, (int __user *)optval)) {
+ err = -EFAULT;
+ goto out;
+ }
break;
default:
- return -ENOPROTOOPT;
+ err = -ENOPROTOOPT;
}
- return 0;
-}
-
-static int irda_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- int err;
+out:
- lock_kernel();
- err = __irda_getsockopt(sock, level, optname, optval, optlen);
- unlock_kernel();
+ release_sock(sk);
return err;
}
@@ -2628,7 +2650,7 @@ static const struct proto_ops irda_seqpacket_ops = {
.socketpair = sock_no_socketpair,
.accept = irda_accept,
.getname = irda_getname,
- .poll = irda_datagram_poll,
+ .poll = datagram_poll,
.ioctl = irda_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = irda_compat_ioctl,
@@ -2652,7 +2674,7 @@ static const struct proto_ops irda_dgram_ops = {
.socketpair = sock_no_socketpair,
.accept = irda_accept,
.getname = irda_getname,
- .poll = irda_datagram_poll,
+ .poll = datagram_poll,
.ioctl = irda_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = irda_compat_ioctl,
@@ -2677,7 +2699,7 @@ static const struct proto_ops irda_ultra_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = irda_getname,
- .poll = irda_datagram_poll,
+ .poll = datagram_poll,
.ioctl = irda_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = irda_compat_ioctl,
diff --git a/net/irda/discovery.c b/net/irda/discovery.c
index c1c8ae93912..36c3f037f17 100644
--- a/net/irda/discovery.c
+++ b/net/irda/discovery.c
@@ -315,7 +315,7 @@ struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn,
/* Get the actual number of device in the buffer and return */
*pn = i;
- return(buffer);
+ return buffer;
}
#ifdef CONFIG_PROC_FS
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index faa82ca2dfd..a39cca8331d 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -449,8 +449,8 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp)
}
#ifdef SERIAL_DO_RESTART
- return ((self->flags & ASYNC_HUP_NOTIFY) ?
- -EAGAIN : -ERESTARTSYS);
+ return (self->flags & ASYNC_HUP_NOTIFY) ?
+ -EAGAIN : -ERESTARTSYS;
#else
return -EAGAIN;
#endif
diff --git a/net/irda/iriap.c b/net/irda/iriap.c
index fce364c6c71..5b743bdd89b 100644
--- a/net/irda/iriap.c
+++ b/net/irda/iriap.c
@@ -502,7 +502,8 @@ static void iriap_getvaluebyclass_confirm(struct iriap_cb *self,
IRDA_DEBUG(4, "%s(), strlen=%d\n", __func__, value_len);
/* Make sure the string is null-terminated */
- fp[n+value_len] = 0x00;
+ if (n + value_len < skb->len)
+ fp[n + value_len] = 0x00;
IRDA_DEBUG(4, "Got string %s\n", fp+n);
/* Will truncate to IAS_MAX_STRING bytes */
diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c
index a788f9e9427..6130f9d9dbe 100644
--- a/net/irda/irlan/irlan_common.c
+++ b/net/irda/irlan/irlan_common.c
@@ -1102,7 +1102,7 @@ int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len)
memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */
le16_to_cpus(&val_len); n+=2;
- if (val_len > 1016) {
+ if (val_len >= 1016) {
IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ );
return -RSP_INVALID_COMMAND_FORMAT;
}
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 9616c32d107..8ee1ff6c742 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -45,13 +45,11 @@ static int irlan_eth_close(struct net_device *dev);
static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
struct net_device *dev);
static void irlan_eth_set_multicast_list( struct net_device *dev);
-static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
static const struct net_device_ops irlan_eth_netdev_ops = {
.ndo_open = irlan_eth_open,
.ndo_stop = irlan_eth_close,
.ndo_start_xmit = irlan_eth_xmit,
- .ndo_get_stats = irlan_eth_get_stats,
.ndo_set_multicast_list = irlan_eth_set_multicast_list,
.ndo_change_mtu = eth_change_mtu,
.ndo_validate_addr = eth_validate_addr,
@@ -169,6 +167,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
{
struct irlan_cb *self = netdev_priv(dev);
int ret;
+ unsigned int len;
/* skb headroom large enough to contain all IrDA-headers? */
if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) {
@@ -188,6 +187,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
dev->trans_start = jiffies;
+ len = skb->len;
/* Now queue the packet in the transport layer */
if (self->use_udata)
ret = irttp_udata_request(self->tsap_data, skb);
@@ -206,10 +206,10 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
* tried :-) DB
*/
/* irttp_data_request already free the packet */
- self->stats.tx_dropped++;
+ dev->stats.tx_dropped++;
} else {
- self->stats.tx_packets++;
- self->stats.tx_bytes += skb->len;
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += len;
}
return NETDEV_TX_OK;
@@ -224,15 +224,16 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
{
struct irlan_cb *self = instance;
+ struct net_device *dev = self->dev;
if (skb == NULL) {
- ++self->stats.rx_dropped;
+ dev->stats.rx_dropped++;
return 0;
}
if (skb->len < ETH_HLEN) {
IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n",
__func__, skb->len);
- ++self->stats.rx_dropped;
+ dev->stats.rx_dropped++;
dev_kfree_skb(skb);
return 0;
}
@@ -242,10 +243,10 @@ int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
* might have been previously set by the low level IrDA network
* device driver
*/
- skb->protocol = eth_type_trans(skb, self->dev); /* Remove eth header */
+ skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */
- self->stats.rx_packets++;
- self->stats.rx_bytes += skb->len;
+ dev->stats.rx_packets++;
+ dev->stats.rx_bytes += skb->len;
netif_rx(skb); /* Eat it! */
@@ -346,16 +347,3 @@ static void irlan_eth_set_multicast_list(struct net_device *dev)
else
irlan_set_broadcast_filter(self, FALSE);
}
-
-/*
- * Function irlan_get_stats (dev)
- *
- * Get the current statistics for this device
- *
- */
-static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev)
-{
- struct irlan_cb *self = netdev_priv(dev);
-
- return &self->stats;
-}
diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c
index cbcb4eb5403..43f16040a6f 100644
--- a/net/irda/irlan/irlan_event.c
+++ b/net/irda/irlan/irlan_event.c
@@ -24,7 +24,7 @@
#include <net/irda/irlan_event.h>
-char *irlan_state[] = {
+const char * const irlan_state[] = {
"IRLAN_IDLE",
"IRLAN_QUERY",
"IRLAN_CONN",
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 0e7d8bde145..6115a44c0a2 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -939,7 +939,7 @@ struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots)
}
/* Return current cached discovery log */
- return(irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE));
+ return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE);
}
EXPORT_SYMBOL(irlmp_get_discoveries);
diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c
index 3750884094d..062e63b1c5c 100644
--- a/net/irda/irlmp_frame.c
+++ b/net/irda/irlmp_frame.c
@@ -448,7 +448,7 @@ static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel,
(self->cache.slsap_sel == slsap_sel) &&
(self->cache.dlsap_sel == dlsap_sel))
{
- return (self->cache.lsap);
+ return self->cache.lsap;
}
#endif
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 4300df35d37..0d82ff5aeff 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -458,6 +458,8 @@ typedef struct irnet_socket
int disco_index; /* Last read in the discovery log */
int disco_number; /* Size of the discovery log */
+ struct mutex lock;
+
} irnet_socket;
/*
diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c
index e98e40d76f4..7f17a8020e8 100644
--- a/net/irda/irnet/irnet_irda.c
+++ b/net/irda/irnet/irnet_irda.c
@@ -238,7 +238,7 @@ irnet_ias_to_tsap(irnet_socket * self,
DEXIT(IRDA_SR_TRACE, "\n");
/* Return the TSAP */
- return(dtsap_sel);
+ return dtsap_sel;
}
/*------------------------------------------------------------------*/
@@ -301,7 +301,7 @@ irnet_connect_tsap(irnet_socket * self)
{
clear_bit(0, &self->ttp_connect);
DERROR(IRDA_SR_ERROR, "connect aborted!\n");
- return(err);
+ return err;
}
/* Connect to remote device */
@@ -312,7 +312,7 @@ irnet_connect_tsap(irnet_socket * self)
{
clear_bit(0, &self->ttp_connect);
DERROR(IRDA_SR_ERROR, "connect aborted!\n");
- return(err);
+ return err;
}
/* The above call is non-blocking.
@@ -321,7 +321,7 @@ irnet_connect_tsap(irnet_socket * self)
* See you there ;-) */
DEXIT(IRDA_SR_TRACE, "\n");
- return(err);
+ return err;
}
/*------------------------------------------------------------------*/
@@ -362,10 +362,10 @@ irnet_discover_next_daddr(irnet_socket * self)
/* The above request is non-blocking.
* After a while, IrDA will call us back in irnet_discovervalue_confirm()
* We will then call irnet_ias_to_tsap() and come back here again... */
- return(0);
+ return 0;
}
else
- return(1);
+ return 1;
}
/*------------------------------------------------------------------*/
@@ -436,7 +436,7 @@ irnet_discover_daddr_and_lsap_sel(irnet_socket * self)
/* Follow me in irnet_discovervalue_confirm() */
DEXIT(IRDA_SR_TRACE, "\n");
- return(0);
+ return 0;
}
/*------------------------------------------------------------------*/
@@ -485,7 +485,7 @@ irnet_dname_to_daddr(irnet_socket * self)
/* No luck ! */
DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname);
kfree(discoveries);
- return(-EADDRNOTAVAIL);
+ return -EADDRNOTAVAIL;
}
@@ -527,7 +527,7 @@ irda_irnet_create(irnet_socket * self)
INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect);
DEXIT(IRDA_SOCK_TRACE, "\n");
- return(0);
+ return 0;
}
/*------------------------------------------------------------------*/
@@ -601,7 +601,7 @@ irda_irnet_connect(irnet_socket * self)
* We will finish the connection procedure in irnet_connect_tsap().
*/
DEXIT(IRDA_SOCK_TRACE, "\n");
- return(0);
+ return 0;
}
/*------------------------------------------------------------------*/
@@ -733,7 +733,7 @@ irnet_daddr_to_dname(irnet_socket * self)
/* No luck ! */
DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr);
kfree(discoveries);
- return(-EADDRNOTAVAIL);
+ return -EADDRNOTAVAIL;
}
/*------------------------------------------------------------------*/
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 36f43cac8ea..7fa86373de4 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -166,7 +166,7 @@ irnet_ctrl_write(irnet_socket * ap,
}
/* Success : we have parsed all commands successfully */
- return(count);
+ return count;
}
#ifdef INITIAL_DISCOVERY
@@ -300,7 +300,7 @@ irnet_ctrl_read(irnet_socket * ap,
}
DEXIT(CTRL_TRACE, "\n");
- return(strlen(event));
+ return strlen(event);
}
#endif /* INITIAL_DISCOVERY */
@@ -409,7 +409,7 @@ irnet_ctrl_read(irnet_socket * ap,
}
DEXIT(CTRL_TRACE, "\n");
- return(strlen(event));
+ return strlen(event);
}
/*------------------------------------------------------------------*/
@@ -480,7 +480,6 @@ dev_irnet_open(struct inode * inode,
ap = kzalloc(sizeof(*ap), GFP_KERNEL);
DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n");
- lock_kernel();
/* initialize the irnet structure */
ap->file = file;
@@ -502,18 +501,20 @@ dev_irnet_open(struct inode * inode,
{
DERROR(FS_ERROR, "Can't setup IrDA link...\n");
kfree(ap);
- unlock_kernel();
+
return err;
}
/* For the control channel */
ap->event_index = irnet_events.index; /* Cancel all past events */
+ mutex_init(&ap->lock);
+
/* Put our stuff where we will be able to find it later */
file->private_data = ap;
DEXIT(FS_TRACE, " - ap=0x%p\n", ap);
- unlock_kernel();
+
return 0;
}
@@ -623,7 +624,7 @@ dev_irnet_poll(struct file * file,
mask |= irnet_ctrl_poll(ap, file, wait);
DEXIT(FS_TRACE, " - mask=0x%X\n", mask);
- return(mask);
+ return mask;
}
/*------------------------------------------------------------------*/
@@ -664,7 +665,9 @@ dev_irnet_ioctl(
{
DEBUG(FS_INFO, "Entering PPP discipline.\n");
/* PPP channel setup (ap->chan in configured in dev_irnet_open())*/
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
err = ppp_register_channel(&ap->chan);
if(err == 0)
{
@@ -677,14 +680,17 @@ dev_irnet_ioctl(
}
else
DERROR(FS_ERROR, "Can't setup PPP channel...\n");
- unlock_kernel();
+
+ mutex_unlock(&ap->lock);
}
else
{
/* In theory, should be N_TTY */
DEBUG(FS_INFO, "Exiting PPP discipline.\n");
/* Disconnect from the generic PPP layer */
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
if(ap->ppp_open)
{
ap->ppp_open = 0;
@@ -693,24 +699,31 @@ dev_irnet_ioctl(
else
DERROR(FS_ERROR, "Channel not registered !\n");
err = 0;
- unlock_kernel();
+
+ mutex_unlock(&ap->lock);
}
break;
/* Query PPP channel and unit number */
case PPPIOCGCHAN:
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan),
(int __user *)argp))
err = 0;
- unlock_kernel();
+
+ mutex_unlock(&ap->lock);
break;
case PPPIOCGUNIT:
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan),
(int __user *)argp))
err = 0;
- unlock_kernel();
+
+ mutex_unlock(&ap->lock);
break;
/* All these ioctls can be passed both directly and from ppp_generic,
@@ -730,9 +743,12 @@ dev_irnet_ioctl(
if(!capable(CAP_NET_ADMIN))
err = -EPERM;
else {
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
err = ppp_irnet_ioctl(&ap->chan, cmd, arg);
- unlock_kernel();
+
+ mutex_unlock(&ap->lock);
}
break;
@@ -740,7 +756,9 @@ dev_irnet_ioctl(
/* Get termios */
case TCGETS:
DEBUG(FS_INFO, "Get termios.\n");
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
#ifndef TCGETS2
if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios))
err = 0;
@@ -748,12 +766,15 @@ dev_irnet_ioctl(
if(kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios))
err = 0;
#endif
- unlock_kernel();
+
+ mutex_unlock(&ap->lock);
break;
/* Set termios */
case TCSETSF:
DEBUG(FS_INFO, "Set termios.\n");
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
+
#ifndef TCGETS2
if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp))
err = 0;
@@ -761,7 +782,8 @@ dev_irnet_ioctl(
if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp))
err = 0;
#endif
- unlock_kernel();
+
+ mutex_unlock(&ap->lock);
break;
/* Set DTR/RTS */
@@ -784,9 +806,10 @@ dev_irnet_ioctl(
* We should also worry that we don't accept junk here and that
* we get rid of our own buffers */
#ifdef FLUSH_TO_PPP
- lock_kernel();
+ if (mutex_lock_interruptible(&ap->lock))
+ return -EINTR;
ppp_output_wakeup(&ap->chan);
- unlock_kernel();
+ mutex_unlock(&ap->lock);
#endif /* FLUSH_TO_PPP */
err = 0;
break;
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
index b5df2418f90..940225866da 100644
--- a/net/irda/irnet/irnet_ppp.h
+++ b/net/irda/irnet/irnet_ppp.h
@@ -103,7 +103,8 @@ static const struct file_operations irnet_device_fops =
.poll = dev_irnet_poll,
.unlocked_ioctl = dev_irnet_ioctl,
.open = dev_irnet_open,
- .release = dev_irnet_close
+ .release = dev_irnet_close,
+ .llseek = noop_llseek,
/* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */
};
diff --git a/net/irda/parameters.c b/net/irda/parameters.c
index fc1a20565e2..71cd38c1a67 100644
--- a/net/irda/parameters.c
+++ b/net/irda/parameters.c
@@ -298,6 +298,8 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
p.pi = pi; /* In case handler needs to know */
p.pl = buf[1]; /* Extract length of value */
+ if (p.pl > 32)
+ p.pl = 32;
IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __func__,
p.pi, p.pl);
@@ -318,7 +320,7 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
(__u8) str[0], (__u8) str[1]);
/* Null terminate string */
- str[p.pl+1] = '\0';
+ str[p.pl] = '\0';
p.pv.c = str; /* Handler will need to take a copy */
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 43040e97c47..d87c22df6f1 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -565,12 +565,12 @@ pfkey_proto2satype(uint16_t proto)
static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
{
- return (proto == IPSEC_PROTO_ANY ? 0 : proto);
+ return proto == IPSEC_PROTO_ANY ? 0 : proto;
}
static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
{
- return (proto ? proto : IPSEC_PROTO_ANY);
+ return proto ? proto : IPSEC_PROTO_ANY;
}
static inline int pfkey_sockaddr_len(sa_family_t family)
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 58c6c4cda73..8d9ce0accc9 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -132,7 +132,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
printk("\n");
}
- if (data_len < ETH_HLEN)
+ if (!pskb_may_pull(skb, sizeof(ETH_HLEN)))
goto error;
secpath_reset(skb);
@@ -144,7 +144,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
nf_reset(skb);
if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
- dev->last_rx = jiffies;
dev->stats.rx_packets++;
dev->stats.rx_bytes += data_len;
} else
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 226a0ae3bcf..1c770c0644d 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -65,9 +65,7 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif
continue;
if ((l2tp->conn_id == tunnel_id) &&
-#ifdef CONFIG_NET_NS
- (sk->sk_net == net) &&
-#endif
+ net_eq(sock_net(sk), net) &&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
!(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
goto found;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index ff954b3e94b..39a21d0c61c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1768,7 +1768,7 @@ static const struct proto_ops pppol2tp_ops = {
.ioctl = pppox_ioctl,
};
-static struct pppox_proto pppol2tp_proto = {
+static const struct pppox_proto pppol2tp_proto = {
.create = pppol2tp_create,
.ioctl = pppol2tp_ioctl
};
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 023ba820236..58261299821 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1024,7 +1024,8 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
- int rc = -EINVAL, opt;
+ unsigned int opt;
+ int rc = -EINVAL;
lock_sock(sk);
if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index e4dae0244d7..cf4aea3ba30 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -689,7 +689,7 @@ static void llc_station_rcv(struct sk_buff *skb)
int __init llc_station_init(void)
{
- u16 rc = -ENOBUFS;
+ int rc = -ENOBUFS;
struct sk_buff *skb;
struct llc_station_state_ev *ev;
diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
index a87cb3ba2df..d2b03e0851e 100644
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
@@ -138,10 +138,8 @@ struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[])
struct crypto_cipher *tfm;
tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return NULL;
-
- crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN);
+ if (!IS_ERR(tfm))
+ crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN);
return tfm;
}
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index 3d097b3d7b6..b4d66cca76d 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -119,10 +119,8 @@ struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[])
struct crypto_cipher *tfm;
tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return NULL;
-
- crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN);
+ if (!IS_ERR(tfm))
+ crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN);
return tfm;
}
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 965b272499f..720b7a84af5 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -56,7 +56,7 @@ static void ieee80211_free_tid_rx(struct rcu_head *h)
}
void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
- u16 initiator, u16 reason)
+ u16 initiator, u16 reason, bool tx)
{
struct ieee80211_local *local = sta->local;
struct tid_ampdu_rx *tid_rx;
@@ -81,20 +81,21 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
"aggregation for tid %d\n", tid);
/* check if this is a self generated aggregation halt */
- if (initiator == WLAN_BACK_RECIPIENT)
+ if (initiator == WLAN_BACK_RECIPIENT && tx)
ieee80211_send_delba(sta->sdata, sta->sta.addr,
tid, 0, reason);
del_timer_sync(&tid_rx->session_timer);
+ del_timer_sync(&tid_rx->reorder_timer);
call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx);
}
void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
- u16 initiator, u16 reason)
+ u16 initiator, u16 reason, bool tx)
{
mutex_lock(&sta->ampdu_mlme.mtx);
- ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason);
+ ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx);
mutex_unlock(&sta->ampdu_mlme.mtx);
}
@@ -120,6 +121,20 @@ static void sta_rx_agg_session_timer_expired(unsigned long data)
ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
}
+static void sta_rx_agg_reorder_timer_expired(unsigned long data)
+{
+ u8 *ptid = (u8 *)data;
+ u8 *timer_to_id = ptid - *ptid;
+ struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+ timer_to_tid[0]);
+
+ rcu_read_lock();
+ spin_lock(&sta->lock);
+ ieee80211_release_reorder_timeout(sta, *ptid);
+ spin_unlock(&sta->lock);
+ rcu_read_unlock();
+}
+
static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
u8 dialog_token, u16 status, u16 policy,
u16 buf_size, u16 timeout)
@@ -251,11 +266,18 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
goto end;
}
+ spin_lock_init(&tid_agg_rx->reorder_lock);
+
/* rx timer */
tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired;
tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid];
init_timer(&tid_agg_rx->session_timer);
+ /* rx reorder timer */
+ tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired;
+ tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+ init_timer(&tid_agg_rx->reorder_timer);
+
/* prepare reordering buffer */
tid_agg_rx->reorder_buf =
kcalloc(buf_size, sizeof(struct sk_buff *), GFP_ATOMIC);
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index c893f236ace..d4679b265ba 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -145,7 +145,8 @@ static void kfree_tid_tx(struct rcu_head *rcu_head)
}
int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
- enum ieee80211_back_parties initiator)
+ enum ieee80211_back_parties initiator,
+ bool tx)
{
struct ieee80211_local *local = sta->local;
struct tid_ampdu_tx *tid_tx = sta->ampdu_mlme.tid_tx[tid];
@@ -175,6 +176,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state);
+ del_timer_sync(&tid_tx->addba_resp_timer);
+
/*
* After this packets are no longer handed right through
* to the driver but are put onto tid_tx->pending instead,
@@ -183,6 +186,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state);
tid_tx->stop_initiator = initiator;
+ tid_tx->tx_stop = tx;
ret = drv_ampdu_action(local, sta->sdata,
IEEE80211_AMPDU_TX_STOP,
@@ -575,13 +579,14 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
- enum ieee80211_back_parties initiator)
+ enum ieee80211_back_parties initiator,
+ bool tx)
{
int ret;
mutex_lock(&sta->ampdu_mlme.mtx);
- ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator);
+ ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator, tx);
mutex_unlock(&sta->ampdu_mlme.mtx);
@@ -670,7 +675,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
goto unlock_sta;
}
- if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR)
+ if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop)
ieee80211_send_delba(sta->sdata, ra, tid,
WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
@@ -770,7 +775,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
sta->ampdu_mlme.addba_req_num[tid] = 0;
} else {
- ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR);
+ ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR,
+ true);
}
out:
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 29ac8e1a509..18bd0e55060 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -19,33 +19,6 @@
#include "rate.h"
#include "mesh.h"
-static bool nl80211_type_check(enum nl80211_iftype type)
-{
- switch (type) {
- case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_MONITOR:
-#ifdef CONFIG_MAC80211_MESH
- case NL80211_IFTYPE_MESH_POINT:
-#endif
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_AP_VLAN:
- case NL80211_IFTYPE_WDS:
- return true;
- default:
- return false;
- }
-}
-
-static bool nl80211_params_check(enum nl80211_iftype type,
- struct vif_params *params)
-{
- if (!nl80211_type_check(type))
- return false;
-
- return true;
-}
-
static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
enum nl80211_iftype type, u32 *flags,
struct vif_params *params)
@@ -55,9 +28,6 @@ static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
struct ieee80211_sub_if_data *sdata;
int err;
- if (!nl80211_params_check(type, params))
- return -EINVAL;
-
err = ieee80211_if_add(local, name, &dev, type, params);
if (err || type != NL80211_IFTYPE_MONITOR || !flags)
return err;
@@ -82,12 +52,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
int ret;
- if (ieee80211_sdata_running(sdata))
- return -EBUSY;
-
- if (!nl80211_params_check(type, params))
- return -EINVAL;
-
ret = ieee80211_if_change_type(sdata, type);
if (ret)
return ret;
@@ -104,54 +68,71 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
params && params->use_4addr >= 0)
sdata->u.mgd.use_4addr = params->use_4addr;
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags)
- sdata->u.mntr_flags = *flags;
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
+ struct ieee80211_local *local = sdata->local;
+
+ if (ieee80211_sdata_running(sdata)) {
+ /*
+ * Prohibit MONITOR_FLAG_COOK_FRAMES to be
+ * changed while the interface is up.
+ * Else we would need to add a lot of cruft
+ * to update everything:
+ * cooked_mntrs, monitor and all fif_* counters
+ * reconfigure hardware
+ */
+ if ((*flags & MONITOR_FLAG_COOK_FRAMES) !=
+ (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
+ return -EBUSY;
+
+ ieee80211_adjust_monitor_flags(sdata, -1);
+ sdata->u.mntr_flags = *flags;
+ ieee80211_adjust_monitor_flags(sdata, 1);
+
+ ieee80211_configure_filter(local);
+ } else {
+ /*
+ * Because the interface is down, ieee80211_do_stop
+ * and ieee80211_do_open take care of "everything"
+ * mentioned in the comment above.
+ */
+ sdata->u.mntr_flags = *flags;
+ }
+ }
return 0;
}
static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
- u8 key_idx, const u8 *mac_addr,
+ u8 key_idx, bool pairwise, const u8 *mac_addr,
struct key_params *params)
{
- struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct sta_info *sta = NULL;
- enum ieee80211_key_alg alg;
struct ieee80211_key *key;
int err;
- if (!netif_running(dev))
+ if (!ieee80211_sdata_running(sdata))
return -ENETDOWN;
- sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
+ /* reject WEP and TKIP keys if WEP failed to initialize */
switch (params->cipher) {
case WLAN_CIPHER_SUITE_WEP40:
- case WLAN_CIPHER_SUITE_WEP104:
- alg = ALG_WEP;
- break;
case WLAN_CIPHER_SUITE_TKIP:
- alg = ALG_TKIP;
- break;
- case WLAN_CIPHER_SUITE_CCMP:
- alg = ALG_CCMP;
- break;
- case WLAN_CIPHER_SUITE_AES_CMAC:
- alg = ALG_AES_CMAC;
+ case WLAN_CIPHER_SUITE_WEP104:
+ if (IS_ERR(sdata->local->wep_tx_tfm))
+ return -EINVAL;
break;
default:
- return -EINVAL;
+ break;
}
- /* reject WEP and TKIP keys if WEP failed to initialize */
- if ((alg == ALG_WEP || alg == ALG_TKIP) &&
- IS_ERR(sdata->local->wep_tx_tfm))
- return -EINVAL;
+ key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len,
+ params->key, params->seq_len, params->seq);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
- key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key,
- params->seq_len, params->seq);
- if (!key)
- return -ENOMEM;
+ if (pairwise)
+ key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
mutex_lock(&sdata->local->sta_mtx);
@@ -164,9 +145,10 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
}
}
- ieee80211_key_link(key, sdata, sta);
+ err = ieee80211_key_link(key, sdata, sta);
+ if (err)
+ ieee80211_key_free(sdata->local, key);
- err = 0;
out_unlock:
mutex_unlock(&sdata->local->sta_mtx);
@@ -174,7 +156,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
}
static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
- u8 key_idx, const u8 *mac_addr)
+ u8 key_idx, bool pairwise, const u8 *mac_addr)
{
struct ieee80211_sub_if_data *sdata;
struct sta_info *sta;
@@ -191,10 +173,17 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
if (!sta)
goto out_unlock;
- if (sta->key) {
- ieee80211_key_free(sdata->local, sta->key);
- WARN_ON(sta->key);
- ret = 0;
+ if (pairwise) {
+ if (sta->ptk) {
+ ieee80211_key_free(sdata->local, sta->ptk);
+ ret = 0;
+ }
+ } else {
+ if (sta->gtk[key_idx]) {
+ ieee80211_key_free(sdata->local,
+ sta->gtk[key_idx]);
+ ret = 0;
+ }
}
goto out_unlock;
@@ -216,7 +205,8 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
}
static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
- u8 key_idx, const u8 *mac_addr, void *cookie,
+ u8 key_idx, bool pairwise, const u8 *mac_addr,
+ void *cookie,
void (*callback)(void *cookie,
struct key_params *params))
{
@@ -224,7 +214,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
struct sta_info *sta = NULL;
u8 seq[6] = {0};
struct key_params params;
- struct ieee80211_key *key;
+ struct ieee80211_key *key = NULL;
u32 iv32;
u16 iv16;
int err = -ENOENT;
@@ -238,7 +228,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
if (!sta)
goto out;
- key = sta->key;
+ if (pairwise)
+ key = sta->ptk;
+ else if (key_idx < NUM_DEFAULT_KEYS)
+ key = sta->gtk[key_idx];
} else
key = sdata->keys[key_idx];
@@ -247,10 +240,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
memset(&params, 0, sizeof(params));
- switch (key->conf.alg) {
- case ALG_TKIP:
- params.cipher = WLAN_CIPHER_SUITE_TKIP;
+ params.cipher = key->conf.cipher;
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_TKIP:
iv32 = key->u.tkip.tx.iv32;
iv16 = key->u.tkip.tx.iv16;
@@ -268,8 +261,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
params.seq = seq;
params.seq_len = 6;
break;
- case ALG_CCMP:
- params.cipher = WLAN_CIPHER_SUITE_CCMP;
+ case WLAN_CIPHER_SUITE_CCMP:
seq[0] = key->u.ccmp.tx_pn[5];
seq[1] = key->u.ccmp.tx_pn[4];
seq[2] = key->u.ccmp.tx_pn[3];
@@ -279,14 +271,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
params.seq = seq;
params.seq_len = 6;
break;
- case ALG_WEP:
- if (key->conf.keylen == 5)
- params.cipher = WLAN_CIPHER_SUITE_WEP40;
- else
- params.cipher = WLAN_CIPHER_SUITE_WEP104;
- break;
- case ALG_AES_CMAC:
- params.cipher = WLAN_CIPHER_SUITE_AES_CMAC;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
seq[0] = key->u.aes_cmac.tx_pn[5];
seq[1] = key->u.aes_cmac.tx_pn[4];
seq[2] = key->u.aes_cmac.tx_pn[3];
@@ -342,13 +327,19 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
STATION_INFO_TX_BYTES |
STATION_INFO_RX_PACKETS |
STATION_INFO_TX_PACKETS |
- STATION_INFO_TX_BITRATE;
+ STATION_INFO_TX_RETRIES |
+ STATION_INFO_TX_FAILED |
+ STATION_INFO_TX_BITRATE |
+ STATION_INFO_RX_DROP_MISC;
sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
sinfo->rx_bytes = sta->rx_bytes;
sinfo->tx_bytes = sta->tx_bytes;
sinfo->rx_packets = sta->rx_packets;
sinfo->tx_packets = sta->tx_packets;
+ sinfo->tx_retries = sta->tx_retry_count;
+ sinfo->tx_failed = sta->tx_retry_failed;
+ sinfo->rx_dropped_misc = sta->rx_dropped;
if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) ||
(sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) {
@@ -634,6 +625,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
struct sta_info *sta,
struct station_parameters *params)
{
+ unsigned long flags;
u32 rates;
int i, j;
struct ieee80211_supported_band *sband;
@@ -642,7 +634,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
sband = local->hw.wiphy->bands[local->oper_channel->band];
- spin_lock_bh(&sta->lock);
+ spin_lock_irqsave(&sta->flaglock, flags);
mask = params->sta_flags_mask;
set = params->sta_flags_set;
@@ -669,7 +661,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
if (set & BIT(NL80211_STA_FLAG_MFP))
sta->flags |= WLAN_STA_MFP;
}
- spin_unlock_bh(&sta->lock);
+ spin_unlock_irqrestore(&sta->flaglock, flags);
/*
* cfg80211 validates this (1-2007) and allows setting the AID
@@ -1143,9 +1135,9 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
p.uapsd = false;
if (drv_conf_tx(local, params->queue, &p)) {
- printk(KERN_DEBUG "%s: failed to set TX queue "
- "parameters for queue %d\n",
- wiphy_name(local->hw.wiphy), params->queue);
+ wiphy_debug(local->hw.wiphy,
+ "failed to set TX queue parameters for queue %d\n",
+ params->queue);
return -EINVAL;
}
@@ -1207,15 +1199,26 @@ static int ieee80211_scan(struct wiphy *wiphy,
struct net_device *dev,
struct cfg80211_scan_request *req)
{
- struct ieee80211_sub_if_data *sdata;
-
- sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_ADHOC &&
- sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
- (sdata->vif.type != NL80211_IFTYPE_AP || sdata->u.ap.beacon))
+ switch (ieee80211_vif_type_p2p(&sdata->vif)) {
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ break;
+ case NL80211_IFTYPE_P2P_GO:
+ if (sdata->local->ops->hw_scan)
+ break;
+ /* FIXME: implement NoA while scanning in software */
+ return -EOPNOTSUPP;
+ case NL80211_IFTYPE_AP:
+ if (sdata->u.ap.beacon)
+ return -EOPNOTSUPP;
+ break;
+ default:
return -EOPNOTSUPP;
+ }
return ieee80211_request_scan(sdata, req);
}
@@ -1362,7 +1365,7 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, int *dbm)
}
static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev,
- u8 *addr)
+ const u8 *addr)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -1411,7 +1414,7 @@ int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
if (!sdata->u.mgd.associated ||
sdata->vif.bss_conf.channel_type == NL80211_CHAN_NO_HT) {
mutex_lock(&sdata->local->iflist_mtx);
- ieee80211_recalc_smps(sdata->local, sdata);
+ ieee80211_recalc_smps(sdata->local);
mutex_unlock(&sdata->local->iflist_mtx);
return 0;
}
@@ -1541,11 +1544,11 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
return ieee80211_wk_cancel_remain_on_channel(sdata, cookie);
}
-static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
- struct ieee80211_channel *chan,
- enum nl80211_channel_type channel_type,
- bool channel_type_valid,
- const u8 *buf, size_t len, u64 *cookie)
+static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev,
+ struct ieee80211_channel *chan,
+ enum nl80211_channel_type channel_type,
+ bool channel_type_valid,
+ const u8 *buf, size_t len, u64 *cookie)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
@@ -1566,7 +1569,11 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
switch (sdata->vif.type) {
case NL80211_IFTYPE_ADHOC:
- if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC)
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ if (!ieee80211_is_action(mgmt->frame_control) ||
+ mgmt->u.action.category == WLAN_CATEGORY_PUBLIC)
break;
rcu_read_lock();
sta = sta_info_get(sdata, mgmt->da);
@@ -1575,8 +1582,7 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
return -ENOLINK;
break;
case NL80211_IFTYPE_STATION:
- if (!(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED))
- flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ case NL80211_IFTYPE_P2P_CLIENT:
break;
default:
return -EOPNOTSUPP;
@@ -1598,6 +1604,23 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
return 0;
}
+static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
+ struct net_device *dev,
+ u16 frame_type, bool reg)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ if (frame_type != (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ))
+ return;
+
+ if (reg)
+ local->probe_req_reg++;
+ else
+ local->probe_req_reg--;
+
+ ieee80211_queue_work(&local->hw, &local->reconfig_filter);
+}
+
struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -1647,6 +1670,7 @@ struct cfg80211_ops mac80211_config_ops = {
.set_bitrate_mask = ieee80211_set_bitrate_mask,
.remain_on_channel = ieee80211_remain_on_channel,
.cancel_remain_on_channel = ieee80211_cancel_remain_on_channel,
- .action = ieee80211_action,
+ .mgmt_tx = ieee80211_mgmt_tx,
.set_cqm_rssi_config = ieee80211_set_cqm_rssi_config,
+ .mgmt_frame_register = ieee80211_mgmt_frame_register,
};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 32be11e4c4d..5b24740fc0b 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -11,7 +11,7 @@ __ieee80211_get_channel_mode(struct ieee80211_local *local,
{
struct ieee80211_sub_if_data *sdata;
- WARN_ON(!mutex_is_locked(&local->iflist_mtx));
+ lockdep_assert_held(&local->iflist_mtx);
list_for_each_entry(sdata, &local->interfaces, list) {
if (sdata == ignore)
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index a694c593ff6..18260aa99c5 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -36,6 +36,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \
static const struct file_operations name## _ops = { \
.read = name## _read, \
.open = mac80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
};
#define DEBUGFS_ADD(name) \
@@ -85,13 +86,15 @@ static ssize_t tsf_write(struct file *file,
if (strncmp(buf, "reset", 5) == 0) {
if (local->ops->reset_tsf) {
drv_reset_tsf(local);
- printk(KERN_INFO "%s: debugfs reset TSF\n", wiphy_name(local->hw.wiphy));
+ wiphy_info(local->hw.wiphy, "debugfs reset TSF\n");
}
} else {
tsf = simple_strtoul(buf, NULL, 0);
if (local->ops->set_tsf) {
drv_set_tsf(local, tsf);
- printk(KERN_INFO "%s: debugfs set TSF to %#018llx\n", wiphy_name(local->hw.wiphy), tsf);
+ wiphy_info(local->hw.wiphy,
+ "debugfs set TSF to %#018llx\n", tsf);
+
}
}
@@ -101,7 +104,8 @@ static ssize_t tsf_write(struct file *file,
static const struct file_operations tsf_ops = {
.read = tsf_read,
.write = tsf_write,
- .open = mac80211_open_file_generic
+ .open = mac80211_open_file_generic,
+ .llseek = default_llseek,
};
static ssize_t reset_write(struct file *file, const char __user *user_buf,
@@ -120,6 +124,7 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf,
static const struct file_operations reset_ops = {
.write = reset_write,
.open = mac80211_open_file_generic,
+ .llseek = noop_llseek,
};
static ssize_t noack_read(struct file *file, char __user *user_buf,
@@ -155,7 +160,8 @@ static ssize_t noack_write(struct file *file,
static const struct file_operations noack_ops = {
.read = noack_read,
.write = noack_write,
- .open = mac80211_open_file_generic
+ .open = mac80211_open_file_generic,
+ .llseek = default_llseek,
};
static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf,
@@ -201,7 +207,8 @@ static ssize_t uapsd_queues_write(struct file *file,
static const struct file_operations uapsd_queues_ops = {
.read = uapsd_queues_read,
.write = uapsd_queues_write,
- .open = mac80211_open_file_generic
+ .open = mac80211_open_file_generic,
+ .llseek = default_llseek,
};
static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf,
@@ -247,7 +254,8 @@ static ssize_t uapsd_max_sp_len_write(struct file *file,
static const struct file_operations uapsd_max_sp_len_ops = {
.read = uapsd_max_sp_len_read,
.write = uapsd_max_sp_len_write,
- .open = mac80211_open_file_generic
+ .open = mac80211_open_file_generic,
+ .llseek = default_llseek,
};
static ssize_t channel_type_read(struct file *file, char __user *user_buf,
@@ -279,7 +287,8 @@ static ssize_t channel_type_read(struct file *file, char __user *user_buf,
static const struct file_operations channel_type_ops = {
.read = channel_type_read,
- .open = mac80211_open_file_generic
+ .open = mac80211_open_file_generic,
+ .llseek = default_llseek,
};
static ssize_t queues_read(struct file *file, char __user *user_buf,
@@ -302,7 +311,8 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
static const struct file_operations queues_ops = {
.read = queues_read,
- .open = mac80211_open_file_generic
+ .open = mac80211_open_file_generic,
+ .llseek = default_llseek,
};
/* statistics stuff */
@@ -346,6 +356,7 @@ static ssize_t stats_ ##name## _read(struct file *file, \
static const struct file_operations stats_ ##name## _ops = { \
.read = stats_ ##name## _read, \
.open = mac80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
};
#define DEBUGFS_STATS_ADD(name, field) \
@@ -366,7 +377,6 @@ void debugfs_hw_add(struct ieee80211_local *local)
if (!phyd)
return;
- local->debugfs.stations = debugfs_create_dir("stations", phyd);
local->debugfs.keys = debugfs_create_dir("keys", phyd);
DEBUGFS_ADD(frequency);
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index fa5e76e658e..4aa47d074a7 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -32,6 +32,7 @@ static ssize_t key_##name##_read(struct file *file, \
static const struct file_operations key_ ##name## _ops = { \
.read = key_##name##_read, \
.open = mac80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
}
#define KEY_FILE(name, format) \
@@ -46,6 +47,7 @@ static const struct file_operations key_ ##name## _ops = { \
static const struct file_operations key_ ##name## _ops = { \
.read = key_conf_##name##_read, \
.open = mac80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
}
#define KEY_CONF_FILE(name, format) \
@@ -64,26 +66,13 @@ static ssize_t key_algorithm_read(struct file *file,
char __user *userbuf,
size_t count, loff_t *ppos)
{
- char *alg;
+ char buf[15];
struct ieee80211_key *key = file->private_data;
+ u32 c = key->conf.cipher;
- switch (key->conf.alg) {
- case ALG_WEP:
- alg = "WEP\n";
- break;
- case ALG_TKIP:
- alg = "TKIP\n";
- break;
- case ALG_CCMP:
- alg = "CCMP\n";
- break;
- case ALG_AES_CMAC:
- alg = "AES-128-CMAC\n";
- break;
- default:
- return 0;
- }
- return simple_read_from_buffer(userbuf, count, ppos, alg, strlen(alg));
+ sprintf(buf, "%.2x-%.2x-%.2x:%d\n",
+ c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff);
+ return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
}
KEY_OPS(algorithm);
@@ -95,21 +84,22 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
int len;
struct ieee80211_key *key = file->private_data;
- switch (key->conf.alg) {
- case ALG_WEP:
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
len = scnprintf(buf, sizeof(buf), "\n");
break;
- case ALG_TKIP:
+ case WLAN_CIPHER_SUITE_TKIP:
len = scnprintf(buf, sizeof(buf), "%08x %04x\n",
key->u.tkip.tx.iv32,
key->u.tkip.tx.iv16);
break;
- case ALG_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP:
tpn = key->u.ccmp.tx_pn;
len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]);
break;
- case ALG_AES_CMAC:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
tpn = key->u.aes_cmac.tx_pn;
len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
tpn[0], tpn[1], tpn[2], tpn[3], tpn[4],
@@ -130,11 +120,12 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
int i, len;
const u8 *rpn;
- switch (key->conf.alg) {
- case ALG_WEP:
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
len = scnprintf(buf, sizeof(buf), "\n");
break;
- case ALG_TKIP:
+ case WLAN_CIPHER_SUITE_TKIP:
for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
p += scnprintf(p, sizeof(buf)+buf-p,
"%08x %04x\n",
@@ -142,7 +133,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
key->u.tkip.rx[i].iv16);
len = p - buf;
break;
- case ALG_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP:
for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) {
rpn = key->u.ccmp.rx_pn[i];
p += scnprintf(p, sizeof(buf)+buf-p,
@@ -152,7 +143,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
}
len = p - buf;
break;
- case ALG_AES_CMAC:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
rpn = key->u.aes_cmac.rx_pn;
p += scnprintf(p, sizeof(buf)+buf-p,
"%02x%02x%02x%02x%02x%02x\n",
@@ -174,11 +165,11 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf,
char buf[20];
int len;
- switch (key->conf.alg) {
- case ALG_CCMP:
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_CCMP:
len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays);
break;
- case ALG_AES_CMAC:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
len = scnprintf(buf, sizeof(buf), "%u\n",
key->u.aes_cmac.replays);
break;
@@ -196,8 +187,8 @@ static ssize_t key_icverrors_read(struct file *file, char __user *userbuf,
char buf[20];
int len;
- switch (key->conf.alg) {
- case ALG_AES_CMAC:
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_AES_CMAC:
len = scnprintf(buf, sizeof(buf), "%u\n",
key->u.aes_cmac.icverrors);
break;
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 20b2998fa0e..cbdf36d7841 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -121,6 +121,7 @@ static const struct file_operations name##_ops = { \
.read = ieee80211_if_read_##name, \
.write = (_write), \
.open = mac80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
}
#define __IEEE80211_IF_FILE_W(name) \
@@ -409,6 +410,9 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
sprintf(buf, "netdev:%s", sdata->name);
sdata->debugfs.dir = debugfs_create_dir(buf,
sdata->local->hw.wiphy->debugfsdir);
+ if (sdata->debugfs.dir)
+ sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
+ sdata->debugfs.dir);
add_files(sdata);
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 76839d4dfaa..4601fea1784 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -36,6 +36,7 @@ static ssize_t sta_ ##name## _read(struct file *file, \
static const struct file_operations sta_ ##name## _ops = { \
.read = sta_##name##_read, \
.open = mac80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
}
#define STA_OPS_RW(name) \
@@ -43,6 +44,7 @@ static const struct file_operations sta_ ##name## _ops = { \
.read = sta_##name##_read, \
.write = sta_##name##_write, \
.open = mac80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
}
#define STA_FILE(name, field, format) \
@@ -196,7 +198,8 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu
else
ret = ieee80211_stop_tx_ba_session(&sta->sta, tid);
} else {
- __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, 3);
+ __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+ 3, true);
ret = 0;
}
@@ -300,7 +303,7 @@ STA_OPS(ht_capa);
void ieee80211_sta_debugfs_add(struct sta_info *sta)
{
- struct dentry *stations_dir = sta->local->debugfs.stations;
+ struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations;
u8 mac[3*ETH_ALEN];
sta->debugfs.add_has_run = true;
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 14123dce544..16983825f8e 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -54,6 +54,20 @@ static inline int drv_add_interface(struct ieee80211_local *local,
return ret;
}
+static inline int drv_change_interface(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type, bool p2p)
+{
+ int ret;
+
+ might_sleep();
+
+ trace_drv_change_interface(local, sdata, type, p2p);
+ ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p);
+ trace_drv_return_int(local, ret);
+ return ret;
+}
+
static inline void drv_remove_interface(struct ieee80211_local *local,
struct ieee80211_vif *vif)
{
diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h
index 5d5d2a97466..6831fb1641c 100644
--- a/net/mac80211/driver-trace.h
+++ b/net/mac80211/driver-trace.h
@@ -25,12 +25,14 @@ static inline void trace_ ## name(proto) {}
#define STA_PR_FMT " sta:%pM"
#define STA_PR_ARG __entry->sta_addr
-#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \
+#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \
+ __field(bool, p2p) \
__string(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
-#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \
+#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \
+ __entry->p2p = sdata->vif.p2p; \
__assign_str(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
-#define VIF_PR_FMT " vif:%s(%d)"
-#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type
+#define VIF_PR_FMT " vif:%s(%d%s)"
+#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""
/*
* Tracing for driver callbacks.
@@ -136,6 +138,34 @@ TRACE_EVENT(drv_add_interface,
)
);
+TRACE_EVENT(drv_change_interface,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type, bool p2p),
+
+ TP_ARGS(local, sdata, type, p2p),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ __field(u32, new_type)
+ __field(bool, new_p2p)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ __entry->new_type = type;
+ __entry->new_p2p = p2p;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT " new type:%d%s",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type,
+ __entry->new_p2p ? "/p2p" : ""
+ )
+);
+
TRACE_EVENT(drv_remove_interface,
TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata),
@@ -336,7 +366,7 @@ TRACE_EVENT(drv_set_key,
LOCAL_ENTRY
VIF_ENTRY
STA_ENTRY
- __field(enum ieee80211_key_alg, alg)
+ __field(u32, cipher)
__field(u8, hw_key_idx)
__field(u8, flags)
__field(s8, keyidx)
@@ -346,7 +376,7 @@ TRACE_EVENT(drv_set_key,
LOCAL_ASSIGN;
VIF_ASSIGN;
STA_ASSIGN;
- __entry->alg = key->alg;
+ __entry->cipher = key->cipher;
__entry->flags = key->flags;
__entry->keyidx = key->keyidx;
__entry->hw_key_idx = key->hw_key_idx;
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 9d101fb3386..75d679d75e6 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -101,16 +101,16 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
ht_cap->mcs.rx_mask[32/8] |= 1;
}
-void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta)
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx)
{
int i;
cancel_work_sync(&sta->ampdu_mlme.work);
for (i = 0; i < STA_TID_NUM; i++) {
- __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR);
+ __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR, tx);
__ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT,
- WLAN_REASON_QSTA_LEAVE_QBSS);
+ WLAN_REASON_QSTA_LEAVE_QBSS, tx);
}
}
@@ -135,7 +135,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired))
___ieee80211_stop_rx_ba_session(
sta, tid, WLAN_BACK_RECIPIENT,
- WLAN_REASON_QSTA_TIMEOUT);
+ WLAN_REASON_QSTA_TIMEOUT, true);
tid_tx = sta->ampdu_mlme.tid_tx[tid];
if (!tid_tx)
@@ -146,7 +146,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
else if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP,
&tid_tx->state))
___ieee80211_stop_tx_ba_session(sta, tid,
- WLAN_BACK_INITIATOR);
+ WLAN_BACK_INITIATOR,
+ true);
}
mutex_unlock(&sta->ampdu_mlme.mtx);
}
@@ -214,9 +215,11 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
#endif /* CONFIG_MAC80211_HT_DEBUG */
if (initiator == WLAN_BACK_INITIATOR)
- __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0);
+ __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0,
+ true);
else
- __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT);
+ __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+ true);
}
int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
@@ -265,3 +268,33 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
return 0;
}
+
+void ieee80211_request_smps_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.mgd.request_smps_work);
+
+ mutex_lock(&sdata->u.mgd.mtx);
+ __ieee80211_request_smps(sdata, sdata->u.mgd.driver_smps_mode);
+ mutex_unlock(&sdata->u.mgd.mtx);
+}
+
+void ieee80211_request_smps(struct ieee80211_vif *vif,
+ enum ieee80211_smps_mode smps_mode)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+ return;
+
+ if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF))
+ smps_mode = IEEE80211_SMPS_AUTOMATIC;
+
+ sdata->u.mgd.driver_smps_mode = smps_mode;
+
+ ieee80211_queue_work(&sdata->local->hw,
+ &sdata->u.mgd.request_smps_work);
+}
+/* this might change ... don't want non-open drivers using it */
+EXPORT_SYMBOL_GPL(ieee80211_request_smps);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index c691780725a..ff60c022f51 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -173,6 +173,19 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
memcpy(skb_put(skb, ifibss->ie_len),
ifibss->ie, ifibss->ie_len);
+ if (local->hw.queues >= 4) {
+ pos = skb_put(skb, 9);
+ *pos++ = WLAN_EID_VENDOR_SPECIFIC;
+ *pos++ = 7; /* len */
+ *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */
+ *pos++ = 0x50;
+ *pos++ = 0xf2;
+ *pos++ = 2; /* WME */
+ *pos++ = 0; /* WME info */
+ *pos++ = 1; /* WME ver */
+ *pos++ = 0; /* U-APSD no in use */
+ }
+
rcu_assign_pointer(ifibss->presp, skb);
sdata->vif.bss_conf.beacon_int = beacon_int;
@@ -266,37 +279,45 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
return;
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) {
- supp_rates = ieee80211_sta_get_rates(local, elems, band);
rcu_read_lock();
-
sta = sta_info_get(sdata, mgmt->sa);
- if (sta) {
- u32 prev_rates;
- prev_rates = sta->sta.supp_rates[band];
- /* make sure mandatory rates are always added */
- sta->sta.supp_rates[band] = supp_rates |
- ieee80211_mandatory_rates(local, band);
+ if (elems->supp_rates) {
+ supp_rates = ieee80211_sta_get_rates(local, elems,
+ band);
+ if (sta) {
+ u32 prev_rates;
+
+ prev_rates = sta->sta.supp_rates[band];
+ /* make sure mandatory rates are always added */
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(local, band);
- if (sta->sta.supp_rates[band] != prev_rates) {
+ if (sta->sta.supp_rates[band] != prev_rates) {
#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: updated supp_rates set "
- "for %pM based on beacon/probe_response "
- "(0x%x -> 0x%x)\n",
- sdata->name, sta->sta.addr,
- prev_rates, sta->sta.supp_rates[band]);
+ printk(KERN_DEBUG
+ "%s: updated supp_rates set "
+ "for %pM based on beacon"
+ "/probe_resp (0x%x -> 0x%x)\n",
+ sdata->name, sta->sta.addr,
+ prev_rates,
+ sta->sta.supp_rates[band]);
#endif
- rate_control_rate_init(sta);
- }
- rcu_read_unlock();
- } else {
- rcu_read_unlock();
- ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa,
- supp_rates, GFP_KERNEL);
+ rate_control_rate_init(sta);
+ }
+ } else
+ sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid,
+ mgmt->sa, supp_rates,
+ GFP_ATOMIC);
}
+
+ if (sta && elems->wmm_info)
+ set_sta_flags(sta, WLAN_STA_WME);
+
+ rcu_read_unlock();
}
bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
@@ -427,8 +448,8 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
return NULL;
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
- wiphy_name(local->hw.wiphy), addr, sdata->name);
+ wiphy_debug(local->hw.wiphy, "Adding new IBSS station %pM (dev=%s)\n",
+ addr, sdata->name);
#endif
sta = sta_info_alloc(sdata, addr, gfp);
@@ -920,12 +941,14 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN);
sdata->u.ibss.ssid_len = params->ssid_len;
+ mutex_unlock(&sdata->u.ibss.mtx);
+
+ mutex_lock(&sdata->local->mtx);
ieee80211_recalc_idle(sdata->local);
+ mutex_unlock(&sdata->local->mtx);
ieee80211_queue_work(&sdata->local->hw, &sdata->work);
- mutex_unlock(&sdata->u.ibss.mtx);
-
return 0;
}
@@ -980,7 +1003,9 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
mutex_unlock(&sdata->u.ibss.mtx);
+ mutex_lock(&local->mtx);
ieee80211_recalc_idle(sdata->local);
+ mutex_unlock(&local->mtx);
return 0;
}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 65e0ed6c297..b80c3868992 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -50,12 +50,6 @@ struct ieee80211_local;
* increased memory use (about 2 kB of RAM per entry). */
#define IEEE80211_FRAGMENT_MAX 4
-/*
- * Time after which we ignore scan results and no longer report/use
- * them in any way.
- */
-#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ)
-
#define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024))
#define IEEE80211_DEFAULT_UAPSD_QUEUES \
@@ -165,12 +159,37 @@ typedef unsigned __bitwise__ ieee80211_rx_result;
#define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u)
#define RX_QUEUED ((__force ieee80211_rx_result) 3u)
-#define IEEE80211_RX_IN_SCAN BIT(0)
-/* frame is destined to interface currently processed (incl. multicast frames) */
-#define IEEE80211_RX_RA_MATCH BIT(1)
-#define IEEE80211_RX_AMSDU BIT(2)
-#define IEEE80211_RX_FRAGMENTED BIT(3)
-/* only add flags here that do not change with subframes of an aMPDU */
+/**
+ * enum ieee80211_packet_rx_flags - packet RX flags
+ * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed
+ * (incl. multicast frames)
+ * @IEEE80211_RX_IN_SCAN: received while scanning
+ * @IEEE80211_RX_FRAGMENTED: fragmented frame
+ * @IEEE80211_RX_AMSDU: a-MSDU packet
+ * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
+ *
+ * These are per-frame flags that are attached to a frame in the
+ * @rx_flags field of &struct ieee80211_rx_status.
+ */
+enum ieee80211_packet_rx_flags {
+ IEEE80211_RX_IN_SCAN = BIT(0),
+ IEEE80211_RX_RA_MATCH = BIT(1),
+ IEEE80211_RX_FRAGMENTED = BIT(2),
+ IEEE80211_RX_AMSDU = BIT(3),
+ IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4),
+};
+
+/**
+ * enum ieee80211_rx_flags - RX data flags
+ *
+ * @IEEE80211_RX_CMNTR: received on cooked monitor already
+ *
+ * These flags are used across handling multiple interfaces
+ * for a single frame.
+ */
+enum ieee80211_rx_flags {
+ IEEE80211_RX_CMNTR = BIT(0),
+};
struct ieee80211_rx_data {
struct sk_buff *skb;
@@ -343,10 +362,14 @@ struct ieee80211_if_managed {
unsigned long timers_running; /* used for quiesce/restart */
bool powersave; /* powersave requested for this iface */
enum ieee80211_smps_mode req_smps, /* requested smps mode */
- ap_smps; /* smps mode AP thinks we're in */
+ ap_smps, /* smps mode AP thinks we're in */
+ driver_smps_mode; /* smps mode request */
+
+ struct work_struct request_smps_work;
unsigned int flags;
+ bool beacon_crc_valid;
u32 beacon_crc;
enum {
@@ -371,6 +394,13 @@ struct ieee80211_if_managed {
int ave_beacon_signal;
/*
+ * Number of Beacon frames used in ave_beacon_signal. This can be used
+ * to avoid generating less reliable cqm events that would be based
+ * only on couple of received frames.
+ */
+ unsigned int count_beacon_signal;
+
+ /*
* Last Beacon frame signal strength average (ave_beacon_signal / 16)
* that triggered a cqm event. 0 indicates that no event has been
* generated for the current association.
@@ -474,6 +504,19 @@ enum ieee80211_sub_if_data_flags {
IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3),
};
+/**
+ * enum ieee80211_sdata_state_bits - virtual interface state bits
+ * @SDATA_STATE_RUNNING: virtual interface is up & running; this
+ * mirrors netif_running() but is separate for interface type
+ * change handling while the interface is up
+ * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel
+ * mode, so queues are stopped
+ */
+enum ieee80211_sdata_state_bits {
+ SDATA_STATE_RUNNING,
+ SDATA_STATE_OFFCHANNEL,
+};
+
struct ieee80211_sub_if_data {
struct list_head list;
@@ -487,6 +530,8 @@ struct ieee80211_sub_if_data {
unsigned int flags;
+ unsigned long state;
+
int drop_unencrypted;
char name[IFNAMSIZ];
@@ -497,17 +542,20 @@ struct ieee80211_sub_if_data {
*/
bool ht_opmode_valid;
+ /* to detect idle changes */
+ bool old_idle;
+
/* Fragment table for host-based reassembly */
struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX];
unsigned int fragment_next;
-#define NUM_DEFAULT_KEYS 4
-#define NUM_DEFAULT_MGMT_KEYS 2
struct ieee80211_key *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
struct ieee80211_key *default_key;
struct ieee80211_key *default_mgmt_key;
u16 sequence_number;
+ __be16 control_port_protocol;
+ bool control_port_no_encrypt;
struct work_struct work;
struct sk_buff_head skb_queue;
@@ -539,6 +587,7 @@ struct ieee80211_sub_if_data {
#ifdef CONFIG_MAC80211_DEBUGFS
struct {
struct dentry *dir;
+ struct dentry *subdir_stations;
struct dentry *default_key;
struct dentry *default_mgmt_key;
} debugfs;
@@ -595,11 +644,17 @@ enum queue_stop_reason {
* determine if we are on the operating channel or not
* @SCAN_OFF_CHANNEL: We're off our operating channel for scanning,
* gets only set in conjunction with SCAN_SW_SCANNING
+ * @SCAN_COMPLETED: Set for our scan work function when the driver reported
+ * that the scan completed.
+ * @SCAN_ABORTED: Set for our scan work function when the driver reported
+ * a scan complete for an aborted scan.
*/
enum {
SCAN_SW_SCANNING,
SCAN_HW_SCANNING,
SCAN_OFF_CHANNEL,
+ SCAN_COMPLETED,
+ SCAN_ABORTED,
};
/**
@@ -634,7 +689,6 @@ struct ieee80211_local {
/*
* work stuff, potentially off-channel (in the future)
*/
- struct mutex work_mtx;
struct list_head work_list;
struct timer_list work_timer;
struct work_struct work_work;
@@ -653,9 +707,13 @@ struct ieee80211_local {
int open_count;
int monitors, cooked_mntrs;
/* number of interfaces with corresponding FIF_ flags */
- int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll;
+ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll,
+ fif_probe_req;
+ int probe_req_reg;
unsigned int filter_flags; /* FIF_* */
+ bool wiphy_ciphers_allocated;
+
/* protects the aggregated multicast list and filter calls */
spinlock_t filter_lock;
@@ -746,9 +804,10 @@ struct ieee80211_local {
*/
struct mutex key_mtx;
+ /* mutex for scan and work locking */
+ struct mutex mtx;
/* Scanning and BSS list */
- struct mutex scan_mtx;
unsigned long scanning;
struct cfg80211_ssid scan_ssid;
struct cfg80211_scan_request *int_scan_req;
@@ -866,10 +925,14 @@ struct ieee80211_local {
#ifdef CONFIG_MAC80211_DEBUGFS
struct local_debugfsdentries {
struct dentry *rcdir;
- struct dentry *stations;
struct dentry *keys;
} debugfs;
#endif
+
+ /* dummy netdev for use w/ NAPI */
+ struct net_device napi_dev;
+
+ struct napi_struct napi;
};
static inline struct ieee80211_sub_if_data *
@@ -1003,6 +1066,8 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb);
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
/* IBSS code */
void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
@@ -1068,10 +1133,12 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
void ieee80211_remove_interfaces(struct ieee80211_local *local);
u32 __ieee80211_recalc_idle(struct ieee80211_local *local);
void ieee80211_recalc_idle(struct ieee80211_local *local);
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+ const int offset);
static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
{
- return netif_running(sdata->dev);
+ return test_bit(SDATA_STATE_RUNNING, &sdata->state);
}
/* tx handling */
@@ -1105,12 +1172,13 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
enum ieee80211_smps_mode smps, const u8 *da,
const u8 *bssid);
+void ieee80211_request_smps_work(struct work_struct *work);
void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
- u16 initiator, u16 reason);
+ u16 initiator, u16 reason, bool stop);
void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
- u16 initiator, u16 reason);
-void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta);
+ u16 initiator, u16 reason, bool stop);
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx);
void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
struct ieee80211_mgmt *mgmt, size_t len);
@@ -1124,13 +1192,16 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
size_t len);
int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
- enum ieee80211_back_parties initiator);
+ enum ieee80211_back_parties initiator,
+ bool tx);
int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
- enum ieee80211_back_parties initiator);
+ enum ieee80211_back_parties initiator,
+ bool tx);
void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid);
void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid);
void ieee80211_ba_session_work(struct work_struct *work);
void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);
/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
@@ -1146,6 +1217,12 @@ int __ieee80211_suspend(struct ieee80211_hw *hw);
static inline int __ieee80211_resume(struct ieee80211_hw *hw)
{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
+ "%s: resume with hardware scan still in progress\n",
+ wiphy_name(hw->wiphy));
+
return ieee80211_reconfig(hw_to_local(hw));
}
#else
@@ -1208,7 +1285,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
const u8 *key, u8 key_len, u8 key_idx);
int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
const u8 *ie, size_t ie_len,
- enum ieee80211_band band);
+ enum ieee80211_band band, u32 rate_mask,
+ u8 channel);
void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
const u8 *ssid, size_t ssid_len,
const u8 *ie, size_t ie_len);
@@ -1221,8 +1299,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
enum ieee80211_band band);
int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
enum ieee80211_smps_mode smps_mode);
-void ieee80211_recalc_smps(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *forsdata);
+void ieee80211_recalc_smps(struct ieee80211_local *local);
size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
const u8 *ids, int n_ids, size_t offset);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index ebbe264e2b0..f9163b12c7f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -24,6 +24,7 @@
#include "led.h"
#include "driver-ops.h"
#include "wme.h"
+#include "rate.h"
/**
* DOC: Interface list locking
@@ -94,21 +95,14 @@ static inline int identical_mac_addr_allowed(int type1, int type2)
type2 == NL80211_IFTYPE_AP_VLAN));
}
-static int ieee80211_open(struct net_device *dev)
+static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype iftype)
{
- struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- struct ieee80211_sub_if_data *nsdata;
struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- u32 changed = 0;
- int res;
- u32 hw_reconf_flags = 0;
- u8 null_addr[ETH_ALEN] = {0};
+ struct ieee80211_sub_if_data *nsdata;
+ struct net_device *dev = sdata->dev;
- /* fail early if user set an invalid address */
- if (compare_ether_addr(dev->dev_addr, null_addr) &&
- !is_valid_ether_addr(dev->dev_addr))
- return -EADDRNOTAVAIL;
+ ASSERT_RTNL();
/* we hold the RTNL here so can safely walk the list */
list_for_each_entry(nsdata, &local->interfaces, list) {
@@ -125,7 +119,7 @@ static int ieee80211_open(struct net_device *dev)
* belonging to the same hardware. Then, however, we're
* faced with having to adopt two different TSF timers...
*/
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
+ if (iftype == NL80211_IFTYPE_ADHOC &&
nsdata->vif.type == NL80211_IFTYPE_ADHOC)
return -EBUSY;
@@ -139,19 +133,56 @@ static int ieee80211_open(struct net_device *dev)
/*
* check whether it may have the same address
*/
- if (!identical_mac_addr_allowed(sdata->vif.type,
+ if (!identical_mac_addr_allowed(iftype,
nsdata->vif.type))
return -ENOTUNIQ;
/*
* can only add VLANs to enabled APs
*/
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+ if (iftype == NL80211_IFTYPE_AP_VLAN &&
nsdata->vif.type == NL80211_IFTYPE_AP)
sdata->bss = &nsdata->u.ap;
}
}
+ return 0;
+}
+
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+ const int offset)
+{
+ struct ieee80211_local *local = sdata->local;
+ u32 flags = sdata->u.mntr_flags;
+
+#define ADJUST(_f, _s) do { \
+ if (flags & MONITOR_FLAG_##_f) \
+ local->fif_##_s += offset; \
+ } while (0)
+
+ ADJUST(FCSFAIL, fcsfail);
+ ADJUST(PLCPFAIL, plcpfail);
+ ADJUST(CONTROL, control);
+ ADJUST(CONTROL, pspoll);
+ ADJUST(OTHER_BSS, other_bss);
+
+#undef ADJUST
+}
+
+/*
+ * NOTE: Be very careful when changing this function, it must NOT return
+ * an error on interface type changes that have been pre-checked, so most
+ * checks should be in ieee80211_check_concurrent_iface.
+ */
+static int ieee80211_do_open(struct net_device *dev, bool coming_up)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ u32 changed = 0;
+ int res;
+ u32 hw_reconf_flags = 0;
+
switch (sdata->vif.type) {
case NL80211_IFTYPE_WDS:
if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
@@ -177,7 +208,9 @@ static int ieee80211_open(struct net_device *dev)
/* no special treatment */
break;
case NL80211_IFTYPE_UNSPECIFIED:
- case __NL80211_IFTYPE_AFTER_LAST:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
/* cannot happen */
WARN_ON(1);
break;
@@ -187,39 +220,30 @@ static int ieee80211_open(struct net_device *dev)
res = drv_start(local);
if (res)
goto err_del_bss;
+ if (local->ops->napi_poll)
+ napi_enable(&local->napi);
/* we're brought up, everything changes */
hw_reconf_flags = ~0;
ieee80211_led_radio(local, true);
}
/*
- * Check all interfaces and copy the hopefully now-present
- * MAC address to those that have the special null one.
+ * Copy the hopefully now-present MAC address to
+ * this interface, if it has the special null one.
*/
- list_for_each_entry(nsdata, &local->interfaces, list) {
- struct net_device *ndev = nsdata->dev;
-
- /*
- * No need to check running since we do not allow
- * it to start up with this invalid address.
- */
- if (compare_ether_addr(null_addr, ndev->dev_addr) == 0) {
- memcpy(ndev->dev_addr,
- local->hw.wiphy->perm_addr,
- ETH_ALEN);
- memcpy(ndev->perm_addr, ndev->dev_addr, ETH_ALEN);
+ if (is_zero_ether_addr(dev->dev_addr)) {
+ memcpy(dev->dev_addr,
+ local->hw.wiphy->perm_addr,
+ ETH_ALEN);
+ memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
+
+ if (!is_valid_ether_addr(dev->dev_addr)) {
+ if (!local->open_count)
+ drv_stop(local);
+ return -EADDRNOTAVAIL;
}
}
- /*
- * Validate the MAC address for this device.
- */
- if (!is_valid_ether_addr(dev->dev_addr)) {
- if (!local->open_count)
- drv_stop(local);
- return -EADDRNOTAVAIL;
- }
-
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP_VLAN:
/* no need to tell driver */
@@ -237,25 +261,17 @@ static int ieee80211_open(struct net_device *dev)
hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
}
- if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
- local->fif_fcsfail++;
- if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
- local->fif_plcpfail++;
- if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) {
- local->fif_control++;
- local->fif_pspoll++;
- }
- if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
- local->fif_other_bss++;
-
+ ieee80211_adjust_monitor_flags(sdata, 1);
ieee80211_configure_filter(local);
netif_carrier_on(dev);
break;
default:
- res = drv_add_interface(local, &sdata->vif);
- if (res)
- goto err_stop;
+ if (coming_up) {
+ res = drv_add_interface(local, &sdata->vif);
+ if (res)
+ goto err_stop;
+ }
if (ieee80211_vif_is_mesh(&sdata->vif)) {
local->fif_other_bss++;
@@ -264,8 +280,11 @@ static int ieee80211_open(struct net_device *dev)
ieee80211_start_mesh(sdata);
} else if (sdata->vif.type == NL80211_IFTYPE_AP) {
local->fif_pspoll++;
+ local->fif_probe_req++;
ieee80211_configure_filter(local);
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ local->fif_probe_req++;
}
changed |= ieee80211_reset_erp_info(sdata);
@@ -277,6 +296,8 @@ static int ieee80211_open(struct net_device *dev)
netif_carrier_on(dev);
}
+ set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
if (sdata->vif.type == NL80211_IFTYPE_WDS) {
/* Create STA entry for the WDS peer */
sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
@@ -294,6 +315,8 @@ static int ieee80211_open(struct net_device *dev)
/* STA has been freed */
goto err_del_interface;
}
+
+ rate_control_rate_init(sta);
}
/*
@@ -307,9 +330,13 @@ static int ieee80211_open(struct net_device *dev)
if (sdata->flags & IEEE80211_SDATA_PROMISC)
atomic_inc(&local->iff_promiscs);
+ mutex_lock(&local->mtx);
hw_reconf_flags |= __ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
+
+ if (coming_up)
+ local->open_count++;
- local->open_count++;
if (hw_reconf_flags) {
ieee80211_hw_config(local, hw_reconf_flags);
/*
@@ -334,22 +361,42 @@ static int ieee80211_open(struct net_device *dev)
sdata->bss = NULL;
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
list_del(&sdata->u.vlan.list);
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
return res;
}
-static int ieee80211_stop(struct net_device *dev)
+static int ieee80211_open(struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ int err;
+
+ /* fail early if user set an invalid address */
+ if (!is_zero_ether_addr(dev->dev_addr) &&
+ !is_valid_ether_addr(dev->dev_addr))
+ return -EADDRNOTAVAIL;
+
+ err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type);
+ if (err)
+ return err;
+
+ return ieee80211_do_open(dev, true);
+}
+
+static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
+ bool going_down)
+{
struct ieee80211_local *local = sdata->local;
unsigned long flags;
struct sk_buff *skb, *tmp;
u32 hw_reconf_flags = 0;
int i;
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+
/*
* Stop TX on this interface first.
*/
- netif_tx_stop_all_queues(dev);
+ netif_tx_stop_all_queues(sdata->dev);
/*
* Purge work for this interface.
@@ -366,12 +413,9 @@ static int ieee80211_stop(struct net_device *dev)
* (because if we remove a STA after ops->remove_interface()
* the driver will have removed the vif info already!)
*
- * We could relax this and only unlink the stations from the
- * hash table and list but keep them on a per-sdata list that
- * will be inserted back again when the interface is brought
- * up again, but I don't currently see a use case for that,
- * except with WDS which gets a STA entry created when it is
- * brought up.
+ * This is relevant only in AP, WDS and mesh modes, since in
+ * all other modes we've already removed all stations when
+ * disconnecting etc.
*/
sta_info_flush(local, sdata);
@@ -387,14 +431,19 @@ static int ieee80211_stop(struct net_device *dev)
if (sdata->flags & IEEE80211_SDATA_PROMISC)
atomic_dec(&local->iff_promiscs);
- if (sdata->vif.type == NL80211_IFTYPE_AP)
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
local->fif_pspoll--;
+ local->fif_probe_req--;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ local->fif_probe_req--;
+ }
- netif_addr_lock_bh(dev);
+ netif_addr_lock_bh(sdata->dev);
spin_lock_bh(&local->filter_lock);
- __hw_addr_unsync(&local->mc_list, &dev->mc, dev->addr_len);
+ __hw_addr_unsync(&local->mc_list, &sdata->dev->mc,
+ sdata->dev->addr_len);
spin_unlock_bh(&local->filter_lock);
- netif_addr_unlock_bh(dev);
+ netif_addr_unlock_bh(sdata->dev);
ieee80211_configure_filter(local);
@@ -406,11 +455,21 @@ static int ieee80211_stop(struct net_device *dev)
struct ieee80211_sub_if_data *vlan, *tmpsdata;
struct beacon_data *old_beacon = sdata->u.ap.beacon;
+ /* sdata_running will return false, so this will disable */
+ ieee80211_bss_info_change_notify(sdata,
+ BSS_CHANGED_BEACON_ENABLED);
+
/* remove beacon */
rcu_assign_pointer(sdata->u.ap.beacon, NULL);
synchronize_rcu();
kfree(old_beacon);
+ /* free all potentially still buffered bcast frames */
+ while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) {
+ local->total_ps_buffered--;
+ dev_kfree_skb(skb);
+ }
+
/* down all dependent devices, that is VLANs */
list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
u.vlan.list)
@@ -418,7 +477,8 @@ static int ieee80211_stop(struct net_device *dev)
WARN_ON(!list_empty(&sdata->u.ap.vlans));
}
- local->open_count--;
+ if (going_down)
+ local->open_count--;
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP_VLAN:
@@ -437,40 +497,9 @@ static int ieee80211_stop(struct net_device *dev)
hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
}
- if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
- local->fif_fcsfail--;
- if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
- local->fif_plcpfail--;
- if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) {
- local->fif_pspoll--;
- local->fif_control--;
- }
- if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
- local->fif_other_bss--;
-
+ ieee80211_adjust_monitor_flags(sdata, -1);
ieee80211_configure_filter(local);
break;
- case NL80211_IFTYPE_STATION:
- del_timer_sync(&sdata->u.mgd.chswitch_timer);
- del_timer_sync(&sdata->u.mgd.timer);
- del_timer_sync(&sdata->u.mgd.conn_mon_timer);
- del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
- /*
- * If any of the timers fired while we waited for it, it will
- * have queued its work. Now the work will be running again
- * but will not rearm the timer again because it checks
- * whether the interface is running, which, at this point,
- * it no longer is.
- */
- cancel_work_sync(&sdata->u.mgd.chswitch_work);
- cancel_work_sync(&sdata->u.mgd.monitor_work);
- cancel_work_sync(&sdata->u.mgd.beacon_connection_loss_work);
-
- /* fall through */
- case NL80211_IFTYPE_ADHOC:
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
- del_timer_sync(&sdata->u.ibss.timer);
- /* fall through */
case NL80211_IFTYPE_MESH_POINT:
if (ieee80211_vif_is_mesh(&sdata->vif)) {
/* other_bss and allmulti are always set on mesh
@@ -498,27 +527,34 @@ static int ieee80211_stop(struct net_device *dev)
ieee80211_scan_cancel(local);
/*
- * Disable beaconing for AP and mesh, IBSS can't
- * still be joined to a network at this point.
+ * Disable beaconing here for mesh only, AP and IBSS
+ * are already taken care of.
*/
- if (sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_MESH_POINT) {
+ if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
ieee80211_bss_info_change_notify(sdata,
BSS_CHANGED_BEACON_ENABLED);
- }
- /* free all remaining keys, there shouldn't be any */
+ /*
+ * Free all remaining keys, there shouldn't be any,
+ * except maybe group keys in AP more or WDS?
+ */
ieee80211_free_keys(sdata);
- drv_remove_interface(local, &sdata->vif);
+
+ if (going_down)
+ drv_remove_interface(local, &sdata->vif);
}
sdata->bss = NULL;
+ mutex_lock(&local->mtx);
hw_reconf_flags |= __ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
ieee80211_recalc_ps(local, -1);
if (local->open_count == 0) {
+ if (local->ops->napi_poll)
+ napi_disable(&local->napi);
ieee80211_clear_tx_pending(local);
ieee80211_stop_device(local);
@@ -541,6 +577,13 @@ static int ieee80211_stop(struct net_device *dev)
}
}
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+static int ieee80211_stop(struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ ieee80211_do_stop(sdata, true);
return 0;
}
@@ -585,8 +628,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
- struct beacon_data *beacon;
- struct sk_buff *skb;
int flushed;
int i;
@@ -599,37 +640,8 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
__skb_queue_purge(&sdata->fragments[i].skb_list);
sdata->fragment_next = 0;
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_AP:
- beacon = sdata->u.ap.beacon;
- rcu_assign_pointer(sdata->u.ap.beacon, NULL);
- synchronize_rcu();
- kfree(beacon);
-
- while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) {
- local->total_ps_buffered--;
- dev_kfree_skb(skb);
- }
-
- break;
- case NL80211_IFTYPE_MESH_POINT:
- if (ieee80211_vif_is_mesh(&sdata->vif))
- mesh_rmc_free(sdata);
- break;
- case NL80211_IFTYPE_ADHOC:
- if (WARN_ON(sdata->u.ibss.presp))
- kfree_skb(sdata->u.ibss.presp);
- break;
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_WDS:
- case NL80211_IFTYPE_AP_VLAN:
- case NL80211_IFTYPE_MONITOR:
- break;
- case NL80211_IFTYPE_UNSPECIFIED:
- case __NL80211_IFTYPE_AFTER_LAST:
- BUG();
- break;
- }
+ if (ieee80211_vif_is_mesh(&sdata->vif))
+ mesh_rmc_free(sdata);
flushed = sta_info_flush(local, sdata);
WARN_ON(flushed);
@@ -791,7 +803,8 @@ static void ieee80211_iface_work(struct work_struct *work)
__ieee80211_stop_rx_ba_session(
sta, tid, WLAN_BACK_RECIPIENT,
- WLAN_REASON_QSTA_REQUIRE_SETUP);
+ WLAN_REASON_QSTA_REQUIRE_SETUP,
+ true);
}
mutex_unlock(&local->sta_mtx);
} else switch (sdata->vif.type) {
@@ -844,9 +857,13 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
/* and set some type-dependent values */
sdata->vif.type = type;
+ sdata->vif.p2p = false;
sdata->dev->netdev_ops = &ieee80211_dataif_ops;
sdata->wdev.iftype = type;
+ sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE);
+ sdata->control_port_no_encrypt = false;
+
/* only monitor differs */
sdata->dev->type = ARPHRD_ETHER;
@@ -854,10 +871,20 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
INIT_WORK(&sdata->work, ieee80211_iface_work);
switch (type) {
+ case NL80211_IFTYPE_P2P_GO:
+ type = NL80211_IFTYPE_AP;
+ sdata->vif.type = type;
+ sdata->vif.p2p = true;
+ /* fall through */
case NL80211_IFTYPE_AP:
skb_queue_head_init(&sdata->u.ap.ps_bc_buf);
INIT_LIST_HEAD(&sdata->u.ap.vlans);
break;
+ case NL80211_IFTYPE_P2P_CLIENT:
+ type = NL80211_IFTYPE_STATION;
+ sdata->vif.type = type;
+ sdata->vif.p2p = true;
+ /* fall through */
case NL80211_IFTYPE_STATION:
ieee80211_sta_setup_sdata(sdata);
break;
@@ -878,7 +905,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_AP_VLAN:
break;
case NL80211_IFTYPE_UNSPECIFIED:
- case __NL80211_IFTYPE_AFTER_LAST:
+ case NUM_NL80211_IFTYPES:
BUG();
break;
}
@@ -886,12 +913,85 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
ieee80211_debugfs_add_netdev(sdata);
}
+static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype type)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret, err;
+ enum nl80211_iftype internal_type = type;
+ bool p2p = false;
+
+ ASSERT_RTNL();
+
+ if (!local->ops->change_interface)
+ return -EBUSY;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ /*
+ * Could maybe also all others here?
+ * Just not sure how that interacts
+ * with the RX/config path e.g. for
+ * mesh.
+ */
+ break;
+ default:
+ return -EBUSY;
+ }
+
+ switch (type) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_ADHOC:
+ /*
+ * Could probably support everything
+ * but WDS here (WDS do_open can fail
+ * under memory pressure, which this
+ * code isn't prepared to handle).
+ */
+ break;
+ case NL80211_IFTYPE_P2P_CLIENT:
+ p2p = true;
+ internal_type = NL80211_IFTYPE_STATION;
+ break;
+ case NL80211_IFTYPE_P2P_GO:
+ p2p = true;
+ internal_type = NL80211_IFTYPE_AP;
+ break;
+ default:
+ return -EBUSY;
+ }
+
+ ret = ieee80211_check_concurrent_iface(sdata, internal_type);
+ if (ret)
+ return ret;
+
+ ieee80211_do_stop(sdata, false);
+
+ ieee80211_teardown_sdata(sdata->dev);
+
+ ret = drv_change_interface(local, sdata, internal_type, p2p);
+ if (ret)
+ type = sdata->vif.type;
+
+ ieee80211_setup_sdata(sdata, type);
+
+ err = ieee80211_do_open(sdata->dev, false);
+ WARN(err, "type change: do_open returned %d", err);
+
+ return ret;
+}
+
int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
enum nl80211_iftype type)
{
+ int ret;
+
ASSERT_RTNL();
- if (type == sdata->vif.type)
+ if (type == ieee80211_vif_type_p2p(&sdata->vif))
return 0;
/* Setting ad-hoc mode on non-IBSS channel is not supported. */
@@ -899,18 +999,15 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
type == NL80211_IFTYPE_ADHOC)
return -EOPNOTSUPP;
- /*
- * We could, here, on changes between IBSS/STA/MESH modes,
- * invoke an MLME function instead that disassociates etc.
- * and goes into the requested mode.
- */
-
- if (ieee80211_sdata_running(sdata))
- return -EBUSY;
-
- /* Purge and reset type-dependent state. */
- ieee80211_teardown_sdata(sdata->dev);
- ieee80211_setup_sdata(sdata, type);
+ if (ieee80211_sdata_running(sdata)) {
+ ret = ieee80211_runtime_change_iftype(sdata, type);
+ if (ret)
+ return ret;
+ } else {
+ /* Purge and reset type-dependent state. */
+ ieee80211_teardown_sdata(sdata->dev);
+ ieee80211_setup_sdata(sdata, type);
+ }
/* reset some values that shouldn't be kept across type changes */
sdata->vif.bss_conf.basic_rates =
@@ -1167,8 +1264,7 @@ static u32 ieee80211_idle_off(struct ieee80211_local *local,
return 0;
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: device no longer idle - %s\n",
- wiphy_name(local->hw.wiphy), reason);
+ wiphy_debug(local->hw.wiphy, "device no longer idle - %s\n", reason);
#endif
local->hw.conf.flags &= ~IEEE80211_CONF_IDLE;
@@ -1181,8 +1277,7 @@ static u32 ieee80211_idle_on(struct ieee80211_local *local)
return 0;
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: device now idle\n",
- wiphy_name(local->hw.wiphy));
+ wiphy_debug(local->hw.wiphy, "device now idle\n");
#endif
drv_flush(local, false);
@@ -1195,28 +1290,61 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
int count = 0;
+ bool working = false, scanning = false;
+ struct ieee80211_work *wk;
- if (!list_empty(&local->work_list))
- return ieee80211_idle_off(local, "working");
-
- if (local->scanning)
- return ieee80211_idle_off(local, "scanning");
+#ifdef CONFIG_PROVE_LOCKING
+ WARN_ON(debug_locks && !lockdep_rtnl_is_held() &&
+ !lockdep_is_held(&local->iflist_mtx));
+#endif
+ lockdep_assert_held(&local->mtx);
list_for_each_entry(sdata, &local->interfaces, list) {
- if (!ieee80211_sdata_running(sdata))
+ if (!ieee80211_sdata_running(sdata)) {
+ sdata->vif.bss_conf.idle = true;
continue;
+ }
+
+ sdata->old_idle = sdata->vif.bss_conf.idle;
+
/* do not count disabled managed interfaces */
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
- !sdata->u.mgd.associated)
+ !sdata->u.mgd.associated) {
+ sdata->vif.bss_conf.idle = true;
continue;
+ }
/* do not count unused IBSS interfaces */
if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
- !sdata->u.ibss.ssid_len)
+ !sdata->u.ibss.ssid_len) {
+ sdata->vif.bss_conf.idle = true;
continue;
+ }
/* count everything else */
count++;
}
+ list_for_each_entry(wk, &local->work_list, list) {
+ working = true;
+ wk->sdata->vif.bss_conf.idle = false;
+ }
+
+ if (local->scan_sdata) {
+ scanning = true;
+ local->scan_sdata->vif.bss_conf.idle = false;
+ }
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (sdata->old_idle == sdata->vif.bss_conf.idle)
+ continue;
+ if (!ieee80211_sdata_running(sdata))
+ continue;
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE);
+ }
+
+ if (working)
+ return ieee80211_idle_off(local, "working");
+ if (scanning)
+ return ieee80211_idle_off(local, "scanning");
if (!count)
return ieee80211_idle_on(local);
else
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 1b9d87ed143..ccd676b2f59 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -49,7 +49,7 @@ static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
static void assert_key_lock(struct ieee80211_local *local)
{
- WARN_ON(!mutex_is_locked(&local->key_mtx));
+ lockdep_assert_held(&local->key_mtx);
}
static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key)
@@ -60,7 +60,7 @@ static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key)
return NULL;
}
-static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
+static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
{
struct ieee80211_sub_if_data *sdata;
struct ieee80211_sta *sta;
@@ -69,12 +69,20 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
might_sleep();
if (!key->local->ops->set_key)
- return;
+ goto out_unsupported;
assert_key_lock(key->local);
sta = get_sta_for_key(key);
+ /*
+ * If this is a per-STA GTK, check if it
+ * is supported; if not, return.
+ */
+ if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) &&
+ !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK))
+ goto out_unsupported;
+
sdata = key->sdata;
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
sdata = container_of(sdata->bss,
@@ -83,14 +91,28 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
ret = drv_set_key(key->local, SET_KEY, sdata, sta, &key->conf);
- if (!ret)
+ if (!ret) {
key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE;
+ return 0;
+ }
- if (ret && ret != -ENOSPC && ret != -EOPNOTSUPP)
- printk(KERN_ERR "mac80211-%s: failed to set key "
- "(%d, %pM) to hardware (%d)\n",
- wiphy_name(key->local->hw.wiphy),
- key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+ if (ret != -ENOSPC && ret != -EOPNOTSUPP)
+ wiphy_err(key->local->hw.wiphy,
+ "failed to set key (%d, %pM) to hardware (%d)\n",
+ key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+
+ out_unsupported:
+ switch (key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
+ case WLAN_CIPHER_SUITE_TKIP:
+ case WLAN_CIPHER_SUITE_CCMP:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ /* all of these we can do in software */
+ return 0;
+ default:
+ return -EINVAL;
+ }
}
static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
@@ -121,14 +143,33 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
sta, &key->conf);
if (ret)
- printk(KERN_ERR "mac80211-%s: failed to remove key "
- "(%d, %pM) from hardware (%d)\n",
- wiphy_name(key->local->hw.wiphy),
- key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+ wiphy_err(key->local->hw.wiphy,
+ "failed to remove key (%d, %pM) from hardware (%d)\n",
+ key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
}
+void ieee80211_key_removed(struct ieee80211_key_conf *key_conf)
+{
+ struct ieee80211_key *key;
+
+ key = container_of(key_conf, struct ieee80211_key, conf);
+
+ might_sleep();
+ assert_key_lock(key->local);
+
+ key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+
+ /*
+ * Flush TX path to avoid attempts to use this key
+ * after this function returns. Until then, drivers
+ * must be prepared to handle the key.
+ */
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ieee80211_key_removed);
+
static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
int idx)
{
@@ -184,6 +225,7 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
+ bool pairwise,
struct ieee80211_key *old,
struct ieee80211_key *new)
{
@@ -192,8 +234,14 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
if (new)
list_add(&new->list, &sdata->key_list);
- if (sta) {
- rcu_assign_pointer(sta->key, new);
+ if (sta && pairwise) {
+ rcu_assign_pointer(sta->ptk, new);
+ } else if (sta) {
+ if (old)
+ idx = old->conf.keyidx;
+ else
+ idx = new->conf.keyidx;
+ rcu_assign_pointer(sta->gtk[idx], new);
} else {
WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
@@ -227,20 +275,18 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
}
}
-struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
- int idx,
- size_t key_len,
+struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
const u8 *key_data,
size_t seq_len, const u8 *seq)
{
struct ieee80211_key *key;
- int i, j;
+ int i, j, err;
BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS);
key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL);
if (!key)
- return NULL;
+ return ERR_PTR(-ENOMEM);
/*
* Default to software encryption; we'll later upload the
@@ -249,15 +295,16 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
key->conf.flags = 0;
key->flags = 0;
- key->conf.alg = alg;
+ key->conf.cipher = cipher;
key->conf.keyidx = idx;
key->conf.keylen = key_len;
- switch (alg) {
- case ALG_WEP:
+ switch (cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
key->conf.iv_len = WEP_IV_LEN;
key->conf.icv_len = WEP_ICV_LEN;
break;
- case ALG_TKIP:
+ case WLAN_CIPHER_SUITE_TKIP:
key->conf.iv_len = TKIP_IV_LEN;
key->conf.icv_len = TKIP_ICV_LEN;
if (seq) {
@@ -269,7 +316,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
}
}
break;
- case ALG_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP:
key->conf.iv_len = CCMP_HDR_LEN;
key->conf.icv_len = CCMP_MIC_LEN;
if (seq) {
@@ -278,42 +325,38 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
key->u.ccmp.rx_pn[i][j] =
seq[CCMP_PN_LEN - j - 1];
}
- break;
- case ALG_AES_CMAC:
- key->conf.iv_len = 0;
- key->conf.icv_len = sizeof(struct ieee80211_mmie);
- if (seq)
- for (j = 0; j < 6; j++)
- key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1];
- break;
- }
- memcpy(key->conf.key, key_data, key_len);
- INIT_LIST_HEAD(&key->list);
-
- if (alg == ALG_CCMP) {
/*
* Initialize AES key state here as an optimization so that
* it does not need to be initialized for every packet.
*/
key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data);
- if (!key->u.ccmp.tfm) {
+ if (IS_ERR(key->u.ccmp.tfm)) {
+ err = PTR_ERR(key->u.ccmp.tfm);
kfree(key);
- return NULL;
+ key = ERR_PTR(err);
}
- }
-
- if (alg == ALG_AES_CMAC) {
+ break;
+ case WLAN_CIPHER_SUITE_AES_CMAC:
+ key->conf.iv_len = 0;
+ key->conf.icv_len = sizeof(struct ieee80211_mmie);
+ if (seq)
+ for (j = 0; j < 6; j++)
+ key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1];
/*
* Initialize AES key state here as an optimization so that
* it does not need to be initialized for every packet.
*/
key->u.aes_cmac.tfm =
ieee80211_aes_cmac_key_setup(key_data);
- if (!key->u.aes_cmac.tfm) {
+ if (IS_ERR(key->u.aes_cmac.tfm)) {
+ err = PTR_ERR(key->u.aes_cmac.tfm);
kfree(key);
- return NULL;
+ key = ERR_PTR(err);
}
+ break;
}
+ memcpy(key->conf.key, key_data, key_len);
+ INIT_LIST_HEAD(&key->list);
return key;
}
@@ -326,9 +369,9 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key)
if (key->local)
ieee80211_key_disable_hw_accel(key);
- if (key->conf.alg == ALG_CCMP)
+ if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP)
ieee80211_aes_key_free(key->u.ccmp.tfm);
- if (key->conf.alg == ALG_AES_CMAC)
+ if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC)
ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm);
if (key->local)
ieee80211_debugfs_key_remove(key);
@@ -336,12 +379,13 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key)
kfree(key);
}
-void ieee80211_key_link(struct ieee80211_key *key,
- struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta)
+int ieee80211_key_link(struct ieee80211_key *key,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
{
struct ieee80211_key *old_key;
- int idx;
+ int idx, ret;
+ bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
BUG_ON(!sdata);
BUG_ON(!key);
@@ -358,13 +402,6 @@ void ieee80211_key_link(struct ieee80211_key *key,
*/
if (test_sta_flags(sta, WLAN_STA_WME))
key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA;
-
- /*
- * This key is for a specific sta interface,
- * inform the driver that it should try to store
- * this key as pairwise key.
- */
- key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
} else {
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
struct sta_info *ap;
@@ -386,19 +423,23 @@ void ieee80211_key_link(struct ieee80211_key *key,
mutex_lock(&sdata->local->key_mtx);
- if (sta)
- old_key = sta->key;
+ if (sta && pairwise)
+ old_key = sta->ptk;
+ else if (sta)
+ old_key = sta->gtk[idx];
else
old_key = sdata->keys[idx];
- __ieee80211_key_replace(sdata, sta, old_key, key);
+ __ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
__ieee80211_key_destroy(old_key);
ieee80211_debugfs_key_add(key);
- ieee80211_key_enable_hw_accel(key);
+ ret = ieee80211_key_enable_hw_accel(key);
mutex_unlock(&sdata->local->key_mtx);
+
+ return ret;
}
static void __ieee80211_key_free(struct ieee80211_key *key)
@@ -408,7 +449,8 @@ static void __ieee80211_key_free(struct ieee80211_key *key)
*/
if (key->sdata)
__ieee80211_key_replace(key->sdata, key->sta,
- key, NULL);
+ key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
+ key, NULL);
__ieee80211_key_destroy(key);
}
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index b665bbb7a47..0db1c0f5f69 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -16,6 +16,9 @@
#include <linux/rcupdate.h>
#include <net/mac80211.h>
+#define NUM_DEFAULT_KEYS 4
+#define NUM_DEFAULT_MGMT_KEYS 2
+
#define WEP_IV_LEN 4
#define WEP_ICV_LEN 4
#define ALG_TKIP_KEY_LEN 32
@@ -123,18 +126,16 @@ struct ieee80211_key {
struct ieee80211_key_conf conf;
};
-struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
- int idx,
- size_t key_len,
+struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
const u8 *key_data,
size_t seq_len, const u8 *seq);
/*
* Insert a key into data structures (sdata, sta if necessary)
* to make it used, free old key.
*/
-void ieee80211_key_link(struct ieee80211_key *key,
- struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta);
+int __must_check ieee80211_key_link(struct ieee80211_key *key,
+ struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta);
void ieee80211_key_free(struct ieee80211_local *local,
struct ieee80211_key *key);
void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 798a91b100c..22bc42b1899 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -54,6 +54,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local)
if (local->monitors || local->scanning)
new_flags |= FIF_BCN_PRBRESP_PROMISC;
+ if (local->fif_probe_req || local->probe_req_reg)
+ new_flags |= FIF_PROBE_REQ;
+
if (local->fif_fcsfail)
new_flags |= FIF_FCSFAIL;
@@ -99,16 +102,19 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
int ret = 0;
int power;
enum nl80211_channel_type channel_type;
+ u32 offchannel_flag;
might_sleep();
scan_chan = local->scan_channel;
+ offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
if (scan_chan) {
chan = scan_chan;
channel_type = NL80211_CHAN_NO_HT;
local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
- } else if (local->tmp_channel) {
+ } else if (local->tmp_channel &&
+ local->oper_channel != local->tmp_channel) {
chan = scan_chan = local->tmp_channel;
channel_type = local->tmp_channel_type;
local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
@@ -117,8 +123,9 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
channel_type = local->_oper_channel_type;
local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL;
}
+ offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
- if (chan != local->hw.conf.channel ||
+ if (offchannel_flag || chan != local->hw.conf.channel ||
channel_type != local->hw.conf.channel_type) {
local->hw.conf.channel = chan;
local->hw.conf.channel_type = channel_type;
@@ -197,6 +204,8 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid;
else if (sdata->vif.type == NL80211_IFTYPE_AP)
sdata->vif.bss_conf.bssid = sdata->vif.addr;
+ else if (sdata->vif.type == NL80211_IFTYPE_WDS)
+ sdata->vif.bss_conf.bssid = NULL;
else if (ieee80211_vif_is_mesh(&sdata->vif)) {
sdata->vif.bss_conf.bssid = zero;
} else {
@@ -207,6 +216,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MESH_POINT:
break;
default:
@@ -291,7 +301,16 @@ static void ieee80211_restart_work(struct work_struct *work)
struct ieee80211_local *local =
container_of(work, struct ieee80211_local, restart_work);
+ /* wait for scan work complete */
+ flush_workqueue(local->workqueue);
+
+ mutex_lock(&local->mtx);
+ WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
+ "%s called with hardware scan in progress\n", __func__);
+ mutex_unlock(&local->mtx);
+
rtnl_lock();
+ ieee80211_scan_cancel(local);
ieee80211_reconfig(local);
rtnl_unlock();
}
@@ -302,7 +321,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw)
trace_api_restart_hw(local);
- /* use this reason, __ieee80211_resume will unblock it */
+ /* use this reason, ieee80211_reconfig will unblock it */
ieee80211_stop_queues_by_reason(hw,
IEEE80211_QUEUE_STOP_REASON_SUSPEND);
@@ -316,7 +335,7 @@ static void ieee80211_recalc_smps_work(struct work_struct *work)
container_of(work, struct ieee80211_local, recalc_smps);
mutex_lock(&local->iflist_mtx);
- ieee80211_recalc_smps(local, NULL);
+ ieee80211_recalc_smps(local);
mutex_unlock(&local->iflist_mtx);
}
@@ -336,9 +355,6 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
struct ieee80211_if_managed *ifmgd;
int c = 0;
- if (!netif_running(ndev))
- return NOTIFY_DONE;
-
/* Make sure it's our interface that got changed */
if (!wdev)
return NOTIFY_DONE;
@@ -349,11 +365,14 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
sdata = IEEE80211_DEV_TO_SUB_IF(ndev);
bss_conf = &sdata->vif.bss_conf;
+ if (!ieee80211_sdata_running(sdata))
+ return NOTIFY_DONE;
+
/* ARP filtering is only supported in managed mode */
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return NOTIFY_DONE;
- idev = sdata->dev->ip_ptr;
+ idev = __in_dev_get_rtnl(sdata->dev);
if (!idev)
return NOTIFY_DONE;
@@ -390,6 +409,80 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
}
#endif
+static int ieee80211_napi_poll(struct napi_struct *napi, int budget)
+{
+ struct ieee80211_local *local =
+ container_of(napi, struct ieee80211_local, napi);
+
+ return local->ops->napi_poll(&local->hw, budget);
+}
+
+void ieee80211_napi_schedule(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ napi_schedule(&local->napi);
+}
+EXPORT_SYMBOL(ieee80211_napi_schedule);
+
+void ieee80211_napi_complete(struct ieee80211_hw *hw)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ napi_complete(&local->napi);
+}
+EXPORT_SYMBOL(ieee80211_napi_complete);
+
+/* There isn't a lot of sense in it, but you can transmit anything you like */
+static const struct ieee80211_txrx_stypes
+ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
+ [NL80211_IFTYPE_ADHOC] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4),
+ },
+ [NL80211_IFTYPE_STATION] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+ },
+ [NL80211_IFTYPE_AP] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+ BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+ BIT(IEEE80211_STYPE_ACTION >> 4),
+ },
+ [NL80211_IFTYPE_AP_VLAN] = {
+ /* copy AP */
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+ BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+ BIT(IEEE80211_STYPE_ACTION >> 4),
+ },
+ [NL80211_IFTYPE_P2P_CLIENT] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+ },
+ [NL80211_IFTYPE_P2P_GO] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+ BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
+ BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+ BIT(IEEE80211_STYPE_ACTION >> 4),
+ },
+};
+
struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
const struct ieee80211_ops *ops)
{
@@ -419,6 +512,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
if (!wiphy)
return NULL;
+ wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes;
+
wiphy->flags |= WIPHY_FLAG_NETNS_OK |
WIPHY_FLAG_4ADDR_AP |
WIPHY_FLAG_4ADDR_STATION;
@@ -444,6 +539,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
/* set up some defaults */
local->hw.queues = 1;
local->hw.max_rates = 1;
+ local->hw.max_report_rates = 0;
local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
local->user_power_level = -1;
@@ -455,7 +551,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
__hw_addr_init(&local->mc_list);
mutex_init(&local->iflist_mtx);
- mutex_init(&local->scan_mtx);
+ mutex_init(&local->mtx);
mutex_init(&local->key_mtx);
spin_lock_init(&local->filter_lock);
@@ -494,6 +590,9 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
skb_queue_head_init(&local->skb_queue);
skb_queue_head_init(&local->skb_queue_unreliable);
+ /* init dummy netdev for use w/ NAPI */
+ init_dummy_netdev(&local->napi_dev);
+
return local_to_hw(local);
}
EXPORT_SYMBOL(ieee80211_alloc_hw);
@@ -506,6 +605,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
int channels, max_bitrates;
bool supp_ht;
static const u32 cipher_suites[] = {
+ /* keep WEP first, it may be removed below */
WLAN_CIPHER_SUITE_WEP40,
WLAN_CIPHER_SUITE_WEP104,
WLAN_CIPHER_SUITE_TKIP,
@@ -515,6 +615,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
WLAN_CIPHER_SUITE_AES_CMAC
};
+ if (hw->max_report_rates == 0)
+ hw->max_report_rates = hw->max_rates;
+
/*
* generic code guarantees at least one band,
* set this very early because much code assumes
@@ -554,6 +657,14 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* mac80211 always supports monitor */
local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
+#ifndef CONFIG_MAC80211_MESH
+ /* mesh depends on Kconfig, but drivers should set it if they want */
+ local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT);
+#endif
+
+ /* mac80211 supports control port protocol changing */
+ local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL;
+
if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
@@ -589,10 +700,41 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (local->hw.wiphy->max_scan_ie_len)
local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len;
- local->hw.wiphy->cipher_suites = cipher_suites;
- local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
- if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE))
- local->hw.wiphy->n_cipher_suites--;
+ /* Set up cipher suites unless driver already did */
+ if (!local->hw.wiphy->cipher_suites) {
+ local->hw.wiphy->cipher_suites = cipher_suites;
+ local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
+ if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE))
+ local->hw.wiphy->n_cipher_suites--;
+ }
+ if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) {
+ if (local->hw.wiphy->cipher_suites == cipher_suites) {
+ local->hw.wiphy->cipher_suites += 2;
+ local->hw.wiphy->n_cipher_suites -= 2;
+ } else {
+ u32 *suites;
+ int r, w = 0;
+
+ /* Filter out WEP */
+
+ suites = kmemdup(
+ local->hw.wiphy->cipher_suites,
+ sizeof(u32) * local->hw.wiphy->n_cipher_suites,
+ GFP_KERNEL);
+ if (!suites)
+ return -ENOMEM;
+ for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) {
+ u32 suite = local->hw.wiphy->cipher_suites[r];
+ if (suite == WLAN_CIPHER_SUITE_WEP40 ||
+ suite == WLAN_CIPHER_SUITE_WEP104)
+ continue;
+ suites[w++] = suite;
+ }
+ local->hw.wiphy->cipher_suites = suites;
+ local->hw.wiphy->n_cipher_suites = w;
+ local->wiphy_ciphers_allocated = true;
+ }
+ }
result = wiphy_register(local->hw.wiphy);
if (result < 0)
@@ -641,16 +783,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
result = ieee80211_wep_init(local);
if (result < 0)
- printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n",
- wiphy_name(local->hw.wiphy), result);
+ wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
+ result);
rtnl_lock();
result = ieee80211_init_rate_ctrl_alg(local,
hw->rate_control_algorithm);
if (result < 0) {
- printk(KERN_DEBUG "%s: Failed to initialize rate control "
- "algorithm\n", wiphy_name(local->hw.wiphy));
+ wiphy_debug(local->hw.wiphy,
+ "Failed to initialize rate control algorithm\n");
goto fail_rate;
}
@@ -659,8 +801,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
result = ieee80211_if_add(local, "wlan%d", NULL,
NL80211_IFTYPE_STATION, NULL);
if (result)
- printk(KERN_WARNING "%s: Failed to add default virtual iface\n",
- wiphy_name(local->hw.wiphy));
+ wiphy_warn(local->hw.wiphy,
+ "Failed to add default virtual iface\n");
}
rtnl_unlock();
@@ -683,6 +825,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
goto fail_ifa;
#endif
+ netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll,
+ local->hw.napi_weight);
+
return 0;
#ifdef CONFIG_INET
@@ -703,6 +848,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
fail_workqueue:
wiphy_unregister(local->hw.wiphy);
fail_wiphy_register:
+ if (local->wiphy_ciphers_allocated)
+ kfree(local->hw.wiphy->cipher_suites);
kfree(local->int_scan_req);
return result;
}
@@ -732,6 +879,13 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
rtnl_unlock();
+ /*
+ * Now all work items will be gone, but the
+ * timer might still be armed, so delete it
+ */
+ del_timer_sync(&local->work_timer);
+
+ cancel_work_sync(&local->restart_work);
cancel_work_sync(&local->reconfig_filter);
ieee80211_clear_tx_pending(local);
@@ -740,8 +894,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
if (skb_queue_len(&local->skb_queue) ||
skb_queue_len(&local->skb_queue_unreliable))
- printk(KERN_WARNING "%s: skb_queue not empty\n",
- wiphy_name(local->hw.wiphy));
+ wiphy_warn(local->hw.wiphy, "skb_queue not empty\n");
skb_queue_purge(&local->skb_queue);
skb_queue_purge(&local->skb_queue_unreliable);
@@ -758,7 +911,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
struct ieee80211_local *local = hw_to_local(hw);
mutex_destroy(&local->iflist_mtx);
- mutex_destroy(&local->scan_mtx);
+ mutex_destroy(&local->mtx);
+
+ if (local->wiphy_ciphers_allocated)
+ kfree(local->hw.wiphy->cipher_suites);
wiphy_free(local->hw.wiphy);
}
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index ea13a80a476..1c91f0f3c30 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -412,7 +412,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
enum plink_event event;
enum plink_frame_type ftype;
size_t baselen;
- bool deactivated;
+ bool deactivated, matches_local = true;
u8 ie_len;
u8 *baseaddr;
__le16 plid, llid, reason;
@@ -487,6 +487,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
/* Now we will figure out the appropriate event... */
event = PLINK_UNDEFINED;
if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) {
+ matches_local = false;
switch (ftype) {
case PLINK_OPEN:
event = OPN_RJCT;
@@ -498,7 +499,15 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
/* avoid warning */
break;
}
- spin_lock_bh(&sta->lock);
+ }
+
+ if (!sta && !matches_local) {
+ rcu_read_unlock();
+ reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION);
+ llid = 0;
+ mesh_plink_frame_tx(sdata, PLINK_CLOSE, mgmt->sa, llid,
+ plid, reason);
+ return;
} else if (!sta) {
/* ftype == PLINK_OPEN */
u32 rates;
@@ -522,7 +531,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
}
event = OPN_ACPT;
spin_lock_bh(&sta->lock);
- } else {
+ } else if (matches_local) {
spin_lock_bh(&sta->lock);
switch (ftype) {
case PLINK_OPEN:
@@ -564,6 +573,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
rcu_read_unlock();
return;
}
+ } else {
+ spin_lock_bh(&sta->lock);
}
mpl_dbg("Mesh plink (peer, state, llid, plid, event): %pM %s %d %d %d\n",
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b6c163ac22d..a3a9421555a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -54,6 +54,12 @@
*/
#define IEEE80211_SIGNAL_AVE_WEIGHT 3
+/*
+ * How many Beacon frames need to have been used in average signal strength
+ * before starting to indicate signal change events.
+ */
+#define IEEE80211_SIGNAL_AVE_MIN_COUNT 4
+
#define TMR_RUNNING_TIMER 0
#define TMR_RUNNING_CHANSW 1
@@ -86,7 +92,7 @@ enum rx_mgmt_action {
/* utils */
static inline void ASSERT_MGD_MTX(struct ieee80211_if_managed *ifmgd)
{
- WARN_ON(!mutex_is_locked(&ifmgd->mtx));
+ lockdep_assert_held(&ifmgd->mtx);
}
/*
@@ -109,7 +115,7 @@ static void run_again(struct ieee80211_if_managed *ifmgd,
mod_timer(&ifmgd->timer, timeout);
}
-static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata)
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata)
{
if (sdata->local->hw.flags & IEEE80211_HW_BEACON_FILTER)
return;
@@ -118,6 +124,19 @@ static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata)
round_jiffies_up(jiffies + IEEE80211_BEACON_LOSS_TIME));
}
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+ return;
+
+ mod_timer(&sdata->u.mgd.conn_mon_timer,
+ round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
+
+ ifmgd->probe_send_count = 0;
+}
+
static int ecw2cw(int ecw)
{
return (1 << ecw) - 1;
@@ -778,16 +797,17 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
params.uapsd = uapsd;
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d "
- "cWmin=%d cWmax=%d txop=%d uapsd=%d\n",
- wiphy_name(local->hw.wiphy), queue, aci, acm,
- params.aifs, params.cw_min, params.cw_max, params.txop,
- params.uapsd);
+ wiphy_debug(local->hw.wiphy,
+ "WMM queue=%d aci=%d acm=%d aifs=%d "
+ "cWmin=%d cWmax=%d txop=%d uapsd=%d\n",
+ queue, aci, acm,
+ params.aifs, params.cw_min, params.cw_max,
+ params.txop, params.uapsd);
#endif
if (drv_conf_tx(local, queue, &params))
- printk(KERN_DEBUG "%s: failed to set TX queue "
- "parameters for queue %d\n",
- wiphy_name(local->hw.wiphy), queue);
+ wiphy_debug(local->hw.wiphy,
+ "failed to set TX queue parameters for queue %d\n",
+ queue);
}
/* enable WMM or activate new settings */
@@ -860,14 +880,6 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
sdata->u.mgd.flags &= ~(IEEE80211_STA_CONNECTION_POLL |
IEEE80211_STA_BEACON_POLL);
- /*
- * Always handle WMM once after association regardless
- * of the first value the AP uses. Setting -1 here has
- * that effect because the AP values is an unsigned
- * 4-bit value.
- */
- sdata->u.mgd.wmm_last_param_set = -1;
-
ieee80211_led_assoc(local, 1);
if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD)
@@ -901,7 +913,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
mutex_lock(&local->iflist_mtx);
ieee80211_recalc_ps(local, -1);
- ieee80211_recalc_smps(local, sdata);
+ ieee80211_recalc_smps(local);
mutex_unlock(&local->iflist_mtx);
netif_tx_start_all_queues(sdata->dev);
@@ -909,7 +921,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
}
static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
- bool remove_sta)
+ bool remove_sta, bool tx)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
@@ -948,7 +960,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
sta = sta_info_get(sdata, bssid);
if (sta) {
set_sta_flags(sta, WLAN_STA_BLOCK_BA);
- ieee80211_sta_tear_down_BA_sessions(sta);
+ ieee80211_sta_tear_down_BA_sessions(sta, tx);
}
mutex_unlock(&local->sta_mtx);
@@ -990,6 +1002,11 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
if (remove_sta)
sta_info_destroy_addr(sdata, bssid);
+
+ del_timer_sync(&sdata->u.mgd.conn_mon_timer);
+ del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
+ del_timer_sync(&sdata->u.mgd.timer);
+ del_timer_sync(&sdata->u.mgd.chswitch_timer);
}
void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
@@ -1006,21 +1023,26 @@ void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
if (is_multicast_ether_addr(hdr->addr1))
return;
- if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
- return;
-
- mod_timer(&sdata->u.mgd.conn_mon_timer,
- round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
+ ieee80211_sta_reset_conn_monitor(sdata);
}
static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
const u8 *ssid;
+ u8 *dst = ifmgd->associated->bssid;
+ u8 unicast_limit = max(1, IEEE80211_MAX_PROBE_TRIES - 3);
+
+ /*
+ * Try sending broadcast probe requests for the last three
+ * probe requests after the first ones failed since some
+ * buggy APs only support broadcast probe requests.
+ */
+ if (ifmgd->probe_send_count >= unicast_limit)
+ dst = NULL;
ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID);
- ieee80211_send_probe_req(sdata, ifmgd->associated->bssid,
- ssid + 2, ssid[1], NULL, 0);
+ ieee80211_send_probe_req(sdata, dst, ssid + 2, ssid[1], NULL, 0);
ifmgd->probe_send_count++;
ifmgd->probe_timeout = jiffies + IEEE80211_PROBE_WAIT;
@@ -1102,9 +1124,12 @@ static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata)
printk(KERN_DEBUG "Connection to AP %pM lost.\n", bssid);
- ieee80211_set_disassoc(sdata, true);
- ieee80211_recalc_idle(local);
+ ieee80211_set_disassoc(sdata, true, true);
mutex_unlock(&ifmgd->mtx);
+
+ mutex_lock(&local->mtx);
+ ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
/*
* must be outside lock due to cfg80211,
* but that's not a problem.
@@ -1172,8 +1197,10 @@ ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
printk(KERN_DEBUG "%s: deauthenticated from %pM (Reason: %u)\n",
sdata->name, bssid, reason_code);
- ieee80211_set_disassoc(sdata, true);
+ ieee80211_set_disassoc(sdata, true, false);
+ mutex_lock(&sdata->local->mtx);
ieee80211_recalc_idle(sdata->local);
+ mutex_unlock(&sdata->local->mtx);
return RX_MGMT_CFG80211_DEAUTH;
}
@@ -1202,8 +1229,10 @@ ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
printk(KERN_DEBUG "%s: disassociated from %pM (Reason: %u)\n",
sdata->name, mgmt->sa, reason_code);
- ieee80211_set_disassoc(sdata, true);
+ ieee80211_set_disassoc(sdata, true, false);
+ mutex_lock(&sdata->local->mtx);
ieee80211_recalc_idle(sdata->local);
+ mutex_unlock(&sdata->local->mtx);
return RX_MGMT_CFG80211_DISASSOC;
}
@@ -1262,7 +1291,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
rates = 0;
basic_rates = 0;
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+ sband = local->hw.wiphy->bands[wk->chan->band];
for (i = 0; i < elems.supp_rates_len; i++) {
int rate = (elems.supp_rates[i] & 0x7f) * 5;
@@ -1298,11 +1327,11 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
}
}
- sta->sta.supp_rates[local->hw.conf.channel->band] = rates;
+ sta->sta.supp_rates[wk->chan->band] = rates;
sdata->vif.bss_conf.basic_rates = basic_rates;
/* cf. IEEE 802.11 9.2.12 */
- if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+ if (wk->chan->band == IEEE80211_BAND_2GHZ &&
have_higher_than_11mbit)
sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
else
@@ -1330,6 +1359,14 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
return false;
}
+ /*
+ * Always handle WMM once after association regardless
+ * of the first value the AP uses. Setting -1 here has
+ * that effect because the AP values is an unsigned
+ * 4-bit value.
+ */
+ ifmgd->wmm_last_param_set = -1;
+
if (elems.wmm_param)
ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
elems.wmm_param_len);
@@ -1362,7 +1399,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
* Also start the timer that will detect beacon loss.
*/
ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
- mod_beacon_timer(sdata);
+ ieee80211_sta_reset_beacon_monitor(sdata);
return true;
}
@@ -1465,7 +1502,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
* we have or will be receiving any beacons or data, so let's
* schedule the timers again, just in case.
*/
- mod_beacon_timer(sdata);
+ ieee80211_sta_reset_beacon_monitor(sdata);
mod_timer(&ifmgd->conn_mon_timer,
round_jiffies_up(jiffies +
@@ -1540,15 +1577,18 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
ifmgd->last_beacon_signal = rx_status->signal;
if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) {
ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE;
- ifmgd->ave_beacon_signal = rx_status->signal;
+ ifmgd->ave_beacon_signal = rx_status->signal * 16;
ifmgd->last_cqm_event_signal = 0;
+ ifmgd->count_beacon_signal = 1;
} else {
ifmgd->ave_beacon_signal =
(IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 +
(16 - IEEE80211_SIGNAL_AVE_WEIGHT) *
ifmgd->ave_beacon_signal) / 16;
+ ifmgd->count_beacon_signal++;
}
if (bss_conf->cqm_rssi_thold &&
+ ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT &&
!(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) {
int sig = ifmgd->ave_beacon_signal / 16;
int last_event = ifmgd->last_cqm_event_signal;
@@ -1588,7 +1628,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
* Push the beacon loss detection into the future since
* we are processing a beacon from the AP just now.
*/
- mod_beacon_timer(sdata);
+ ieee80211_sta_reset_beacon_monitor(sdata);
ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable,
@@ -1599,7 +1639,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len,
ifmgd->aid);
- if (ncrc != ifmgd->beacon_crc) {
+ if (ncrc != ifmgd->beacon_crc || !ifmgd->beacon_crc_valid) {
ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems,
true);
@@ -1630,9 +1670,10 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
}
}
- if (ncrc == ifmgd->beacon_crc)
+ if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid)
return;
ifmgd->beacon_crc = ncrc;
+ ifmgd->beacon_crc_valid = true;
if (elems.erp_info && elems.erp_info_len >= 1) {
erp_valid = true;
@@ -1751,7 +1792,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
struct ieee80211_work *wk;
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_for_each_entry(wk, &local->work_list, list) {
if (wk->sdata != sdata)
continue;
@@ -1783,7 +1824,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
free_work(wk);
break;
}
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len);
}
@@ -1823,10 +1864,12 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
else if (ifmgd->probe_send_count < IEEE80211_MAX_PROBE_TRIES) {
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "No probe response from AP %pM"
- " after %dms, try %d\n", bssid,
- (1000 * IEEE80211_PROBE_WAIT)/HZ,
- ifmgd->probe_send_count);
+ wiphy_debug(local->hw.wiphy,
+ "%s: No probe response from AP %pM"
+ " after %dms, try %d\n",
+ sdata->name,
+ bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ,
+ ifmgd->probe_send_count);
#endif
ieee80211_mgd_probe_ap_send(sdata);
} else {
@@ -1836,12 +1879,16 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
*/
ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL |
IEEE80211_STA_BEACON_POLL);
- printk(KERN_DEBUG "No probe response from AP %pM"
- " after %dms, disconnecting.\n",
- bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ);
- ieee80211_set_disassoc(sdata, true);
- ieee80211_recalc_idle(local);
+ wiphy_debug(local->hw.wiphy,
+ "%s: No probe response from AP %pM"
+ " after %dms, disconnecting.\n",
+ sdata->name,
+ bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ);
+ ieee80211_set_disassoc(sdata, true, true);
mutex_unlock(&ifmgd->mtx);
+ mutex_lock(&local->mtx);
+ ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
/*
* must be outside lock due to cfg80211,
* but that's not a problem.
@@ -1917,6 +1964,8 @@ void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata)
* time -- the code here is properly synchronised.
*/
+ cancel_work_sync(&ifmgd->request_smps_work);
+
cancel_work_sync(&ifmgd->beacon_connection_loss_work);
if (del_timer_sync(&ifmgd->timer))
set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running);
@@ -1952,6 +2001,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work);
INIT_WORK(&ifmgd->beacon_connection_loss_work,
ieee80211_beacon_connection_loss_work);
+ INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_work);
setup_timer(&ifmgd->timer, ieee80211_sta_timer,
(unsigned long) sdata);
setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer,
@@ -2158,7 +2208,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
}
/* Trying to reassociate - clear previous association state */
- ieee80211_set_disassoc(sdata, true);
+ ieee80211_set_disassoc(sdata, true, false);
}
mutex_unlock(&ifmgd->mtx);
@@ -2169,6 +2219,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
ifmgd->flags &= ~IEEE80211_STA_DISABLE_11N;
ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
+ ifmgd->beacon_crc_valid = false;
+
for (i = 0; i < req->crypto.n_ciphers_pairwise; i++)
if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 ||
req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP ||
@@ -2249,6 +2301,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
else
ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT;
+ sdata->control_port_protocol = req->crypto.control_port_ethertype;
+ sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt;
+
ieee80211_add_work(wk);
return 0;
}
@@ -2267,7 +2322,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
memcpy(bssid, req->bss->bssid, ETH_ALEN);
if (ifmgd->associated == req->bss) {
- ieee80211_set_disassoc(sdata, false);
+ ieee80211_set_disassoc(sdata, false, true);
mutex_unlock(&ifmgd->mtx);
assoc_bss = true;
} else {
@@ -2275,7 +2330,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
mutex_unlock(&ifmgd->mtx);
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_for_each_entry(wk, &local->work_list, list) {
if (wk->sdata != sdata)
continue;
@@ -2294,7 +2349,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
free_work(wk);
break;
}
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
/*
* If somebody requests authentication and we haven't
@@ -2319,7 +2374,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
if (assoc_bss)
sta_info_destroy_addr(sdata, bssid);
+ mutex_lock(&sdata->local->mtx);
ieee80211_recalc_idle(sdata->local);
+ mutex_unlock(&sdata->local->mtx);
return 0;
}
@@ -2348,7 +2405,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
sdata->name, req->bss->bssid, req->reason_code);
memcpy(bssid, req->bss->bssid, ETH_ALEN);
- ieee80211_set_disassoc(sdata, false);
+ ieee80211_set_disassoc(sdata, false, true);
mutex_unlock(&ifmgd->mtx);
@@ -2357,7 +2414,9 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
cookie, !req->local_state_change);
sta_info_destroy_addr(sdata, bssid);
+ mutex_lock(&sdata->local->mtx);
ieee80211_recalc_idle(sdata->local);
+ mutex_unlock(&sdata->local->mtx);
return 0;
}
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index c36b1911987..4b564091e51 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -22,12 +22,16 @@
static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
local->offchannel_ps_enabled = false;
/* FIXME: what to do when local->pspolling is true? */
del_timer_sync(&local->dynamic_ps_timer);
+ del_timer_sync(&ifmgd->bcn_mon_timer);
+ del_timer_sync(&ifmgd->conn_mon_timer);
+
cancel_work_sync(&local->dynamic_ps_enable_work);
if (local->hw.conf.flags & IEEE80211_CONF_PS) {
@@ -85,6 +89,9 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
mod_timer(&local->dynamic_ps_timer, jiffies +
msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
}
+
+ ieee80211_sta_reset_beacon_monitor(sdata);
+ ieee80211_sta_reset_conn_monitor(sdata);
}
void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local)
@@ -112,8 +119,10 @@ void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local)
* used from user space controlled off-channel operations.
*/
if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ sdata->vif.type != NL80211_IFTYPE_MONITOR) {
+ set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
netif_tx_stop_all_queues(sdata->dev);
+ }
}
mutex_unlock(&local->iflist_mtx);
}
@@ -131,6 +140,7 @@ void ieee80211_offchannel_stop_station(struct ieee80211_local *local)
continue;
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
netif_tx_stop_all_queues(sdata->dev);
if (sdata->u.mgd.associated)
ieee80211_offchannel_ps_enable(sdata);
@@ -155,8 +165,20 @@ void ieee80211_offchannel_return(struct ieee80211_local *local,
ieee80211_offchannel_ps_disable(sdata);
}
- if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ if (sdata->vif.type != NL80211_IFTYPE_MONITOR) {
+ clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
+ /*
+ * This may wake up queues even though the driver
+ * currently has them stopped. This is not very
+ * likely, since the driver won't have gotten any
+ * (or hardly any) new packets while we weren't
+ * on the right channel, and even if it happens
+ * it will at most lead to queueing up one more
+ * packet per queue in mac80211 rather than on
+ * the interface qdisc.
+ */
netif_tx_wake_all_queues(sdata->dev);
+ }
/* re-enable beaconing */
if (enable_beaconing &&
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index d287fde0431..e37355193ed 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -45,7 +45,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw)
list_for_each_entry(sta, &local->sta_list, list) {
if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
set_sta_flags(sta, WLAN_STA_BLOCK_BA);
- ieee80211_sta_tear_down_BA_sessions(sta);
+ ieee80211_sta_tear_down_BA_sessions(sta, true);
}
if (sta->uploaded) {
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 6d0bd198af1..809cf230d25 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -103,6 +103,7 @@ ieee80211_rate_control_ops_get(const char *name)
struct rate_control_ops *ops;
const char *alg_name;
+ kparam_block_sysfs_write(ieee80211_default_rc_algo);
if (!name)
alg_name = ieee80211_default_rc_algo;
else
@@ -120,6 +121,7 @@ ieee80211_rate_control_ops_get(const char *name)
/* try built-in one if specific alg requested but not found */
if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT))
ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT);
+ kparam_unblock_sysfs_write(ieee80211_default_rc_algo);
return ops;
}
@@ -143,6 +145,7 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf,
static const struct file_operations rcname_ops = {
.read = rcname_read,
.open = mac80211_open_file_generic,
+ .llseek = default_llseek,
};
#endif
@@ -205,7 +208,7 @@ static bool rc_no_data_or_no_ack(struct ieee80211_tx_rate_control *txrc)
fc = hdr->frame_control;
- return ((info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc));
+ return (info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc);
}
static void rc_send_low_broadcast(s8 *idx, u32 basic_rates, u8 max_rate_idx)
@@ -366,8 +369,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
ref = rate_control_alloc(name, local);
if (!ref) {
- printk(KERN_WARNING "%s: Failed to select rate control "
- "algorithm\n", wiphy_name(local->hw.wiphy));
+ wiphy_warn(local->hw.wiphy,
+ "Failed to select rate control algorithm\n");
return -ENOENT;
}
@@ -378,9 +381,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
sta_info_flush(local, NULL);
}
- printk(KERN_DEBUG "%s: Selected rate control "
- "algorithm '%s'\n", wiphy_name(local->hw.wiphy),
- ref->ops->name);
+ wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n",
+ ref->ops->name);
return 0;
}
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 241e76f3fdf..a290ad231d7 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -122,6 +122,7 @@ static const struct file_operations minstrel_stat_fops = {
.open = minstrel_stats_open,
.read = minstrel_stats_read,
.release = minstrel_stats_release,
+ .llseek = default_llseek,
};
void
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index c5b465904e3..2a18d6602d4 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -397,8 +397,9 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
!(info->flags & IEEE80211_TX_STAT_AMPDU))
return;
- if (!info->status.ampdu_len) {
- info->status.ampdu_ack_len = 1;
+ if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) {
+ info->status.ampdu_ack_len =
+ (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0);
info->status.ampdu_len = 1;
}
@@ -426,7 +427,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
group = minstrel_ht_get_group_idx(&ar[i]);
rate = &mi->groups[group].rates[ar[i].idx % 8];
- if (last && (info->flags & IEEE80211_TX_STAT_ACK))
+ if (last)
rate->success += info->status.ampdu_ack_len;
rate->attempts += ar[i].count * info->status.ampdu_len;
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 4a5a4b3e779..cefcb5d2dae 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -90,7 +90,7 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10);
ms->len = p - ms->buf;
- return 0;
+ return nonseekable_open(inode, file);
}
static const struct file_operations minstrel_ht_stat_fops = {
@@ -98,6 +98,7 @@ static const struct file_operations minstrel_ht_stat_fops = {
.open = minstrel_ht_stats_open,
.read = minstrel_stats_read,
.release = minstrel_stats_release,
+ .llseek = no_llseek,
};
void
diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c
index 47438b4a9af..4851e9e2dae 100644
--- a/net/mac80211/rc80211_pid_debugfs.c
+++ b/net/mac80211/rc80211_pid_debugfs.c
@@ -162,7 +162,7 @@ static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf,
file_info->next_entry = (file_info->next_entry + 1) %
RC_PID_EVENT_RING_SIZE;
- /* Print information about the event. Note that userpace needs to
+ /* Print information about the event. Note that userspace needs to
* provide large enough buffers. */
length = length < RC_PID_PRINT_BUF_SIZE ?
length : RC_PID_PRINT_BUF_SIZE;
@@ -206,6 +206,7 @@ static const struct file_operations rc_pid_fop_events = {
.poll = rate_control_pid_events_poll,
.open = rate_control_pid_events_open,
.release = rate_control_pid_events_release,
+ .llseek = noop_llseek,
};
void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fa0f37e4afe..902b03ee8f6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -315,6 +315,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
int tid;
/* does the frame have a qos control field? */
@@ -323,9 +324,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
/* frame has qos control */
tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT)
- rx->flags |= IEEE80211_RX_AMSDU;
- else
- rx->flags &= ~IEEE80211_RX_AMSDU;
+ status->rx_flags |= IEEE80211_RX_AMSDU;
} else {
/*
* IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"):
@@ -387,26 +386,25 @@ static ieee80211_rx_result debug_noinline
ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx)
{
struct ieee80211_local *local = rx->local;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
struct sk_buff *skb = rx->skb;
- if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning)))
+ if (likely(!(status->rx_flags & IEEE80211_RX_IN_SCAN)))
+ return RX_CONTINUE;
+
+ if (test_bit(SCAN_HW_SCANNING, &local->scanning))
return ieee80211_scan_rx(rx->sdata, skb);
- if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning) &&
- (rx->flags & IEEE80211_RX_IN_SCAN))) {
+ if (test_bit(SCAN_SW_SCANNING, &local->scanning)) {
/* drop all the other packets during a software scan anyway */
if (ieee80211_scan_rx(rx->sdata, skb) != RX_QUEUED)
dev_kfree_skb(skb);
return RX_QUEUED;
}
- if (unlikely(rx->flags & IEEE80211_RX_IN_SCAN)) {
- /* scanning finished during invoking of handlers */
- I802_DEBUG_INC(local->rx_handlers_drop_passive_scan);
- return RX_DROP_UNUSABLE;
- }
-
- return RX_CONTINUE;
+ /* scanning finished during invoking of handlers */
+ I802_DEBUG_INC(local->rx_handlers_drop_passive_scan);
+ return RX_DROP_UNUSABLE;
}
@@ -538,20 +536,12 @@ static void ieee80211_release_reorder_frame(struct ieee80211_hw *hw,
int index,
struct sk_buff_head *frames)
{
- struct ieee80211_supported_band *sband;
- struct ieee80211_rate *rate = NULL;
struct sk_buff *skb = tid_agg_rx->reorder_buf[index];
- struct ieee80211_rx_status *status;
if (!skb)
goto no_frame;
- status = IEEE80211_SKB_RXCB(skb);
-
- /* release the reordered frames to stack */
- sband = hw->wiphy->bands[status->band];
- if (!(status->flag & RX_FLAG_HT))
- rate = &sband->bitrates[status->rate_idx];
+ /* release the frame from the reorder ring buffer */
tid_agg_rx->stored_mpdu_num--;
tid_agg_rx->reorder_buf[index] = NULL;
__skb_queue_tail(frames, skb);
@@ -580,9 +570,102 @@ static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw,
* frames that have not yet been received are assumed to be lost and the skb
* can be released for processing. This may also release other skb's from the
* reorder buffer if there are no additional gaps between the frames.
+ *
+ * Callers must hold tid_agg_rx->reorder_lock.
*/
#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10)
+static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw,
+ struct tid_ampdu_rx *tid_agg_rx,
+ struct sk_buff_head *frames)
+{
+ int index, j;
+
+ /* release the buffer until next missing frame */
+ index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
+ tid_agg_rx->buf_size;
+ if (!tid_agg_rx->reorder_buf[index] &&
+ tid_agg_rx->stored_mpdu_num > 1) {
+ /*
+ * No buffers ready to be released, but check whether any
+ * frames in the reorder buffer have timed out.
+ */
+ int skipped = 1;
+ for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
+ j = (j + 1) % tid_agg_rx->buf_size) {
+ if (!tid_agg_rx->reorder_buf[j]) {
+ skipped++;
+ continue;
+ }
+ if (!time_after(jiffies, tid_agg_rx->reorder_time[j] +
+ HT_RX_REORDER_BUF_TIMEOUT))
+ goto set_release_timer;
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+ if (net_ratelimit())
+ wiphy_debug(hw->wiphy,
+ "release an RX reorder frame due to timeout on earlier frames\n");
+#endif
+ ieee80211_release_reorder_frame(hw, tid_agg_rx,
+ j, frames);
+
+ /*
+ * Increment the head seq# also for the skipped slots.
+ */
+ tid_agg_rx->head_seq_num =
+ (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK;
+ skipped = 0;
+ }
+ } else while (tid_agg_rx->reorder_buf[index]) {
+ ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames);
+ index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
+ tid_agg_rx->buf_size;
+ }
+
+ /*
+ * Disable the reorder release timer for now.
+ *
+ * The current implementation lacks a proper locking scheme
+ * which would protect vital statistic and debug counters
+ * from being updated by two different but concurrent BHs.
+ *
+ * More information about the topic is available from:
+ * - thread: http://marc.info/?t=128635927000001
+ *
+ * What was wrong:
+ * => http://marc.info/?l=linux-wireless&m=128636170811964
+ * "Basically the thing is that until your patch, the data
+ * in the struct didn't actually need locking because it
+ * was accessed by the RX path only which is not concurrent."
+ *
+ * List of what needs to be fixed:
+ * => http://marc.info/?l=linux-wireless&m=128656352920957
+ *
+
+ if (tid_agg_rx->stored_mpdu_num) {
+ j = index = seq_sub(tid_agg_rx->head_seq_num,
+ tid_agg_rx->ssn) % tid_agg_rx->buf_size;
+
+ for (; j != (index - 1) % tid_agg_rx->buf_size;
+ j = (j + 1) % tid_agg_rx->buf_size) {
+ if (tid_agg_rx->reorder_buf[j])
+ break;
+ }
+
+ set_release_timer:
+
+ mod_timer(&tid_agg_rx->reorder_timer,
+ tid_agg_rx->reorder_time[j] +
+ HT_RX_REORDER_BUF_TIMEOUT);
+ } else {
+ del_timer(&tid_agg_rx->reorder_timer);
+ }
+ */
+
+set_release_timer:
+ return;
+}
+
/*
* As this function belongs to the RX path it must be under
* rcu_read_lock protection. It returns false if the frame
@@ -598,14 +681,16 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4;
u16 head_seq_num, buf_size;
int index;
+ bool ret = true;
buf_size = tid_agg_rx->buf_size;
head_seq_num = tid_agg_rx->head_seq_num;
+ spin_lock(&tid_agg_rx->reorder_lock);
/* frame with out of date sequence number */
if (seq_less(mpdu_seq_num, head_seq_num)) {
dev_kfree_skb(skb);
- return true;
+ goto out;
}
/*
@@ -626,7 +711,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
/* check if we already stored this frame */
if (tid_agg_rx->reorder_buf[index]) {
dev_kfree_skb(skb);
- return true;
+ goto out;
}
/*
@@ -636,58 +721,19 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
if (mpdu_seq_num == tid_agg_rx->head_seq_num &&
tid_agg_rx->stored_mpdu_num == 0) {
tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num);
- return false;
+ ret = false;
+ goto out;
}
/* put the frame in the reordering buffer */
tid_agg_rx->reorder_buf[index] = skb;
tid_agg_rx->reorder_time[index] = jiffies;
tid_agg_rx->stored_mpdu_num++;
- /* release the buffer until next missing frame */
- index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
- tid_agg_rx->buf_size;
- if (!tid_agg_rx->reorder_buf[index] &&
- tid_agg_rx->stored_mpdu_num > 1) {
- /*
- * No buffers ready to be released, but check whether any
- * frames in the reorder buffer have timed out.
- */
- int j;
- int skipped = 1;
- for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
- j = (j + 1) % tid_agg_rx->buf_size) {
- if (!tid_agg_rx->reorder_buf[j]) {
- skipped++;
- continue;
- }
- if (!time_after(jiffies, tid_agg_rx->reorder_time[j] +
- HT_RX_REORDER_BUF_TIMEOUT))
- break;
+ ieee80211_sta_reorder_release(hw, tid_agg_rx, frames);
-#ifdef CONFIG_MAC80211_HT_DEBUG
- if (net_ratelimit())
- printk(KERN_DEBUG "%s: release an RX reorder "
- "frame due to timeout on earlier "
- "frames\n",
- wiphy_name(hw->wiphy));
-#endif
- ieee80211_release_reorder_frame(hw, tid_agg_rx,
- j, frames);
-
- /*
- * Increment the head seq# also for the skipped slots.
- */
- tid_agg_rx->head_seq_num =
- (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK;
- skipped = 0;
- }
- } else while (tid_agg_rx->reorder_buf[index]) {
- ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames);
- index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
- tid_agg_rx->buf_size;
- }
-
- return true;
+ out:
+ spin_unlock(&tid_agg_rx->reorder_lock);
+ return ret;
}
/*
@@ -761,13 +807,14 @@ static ieee80211_rx_result debug_noinline
ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
/* Drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.2.9) */
if (rx->sta && !is_multicast_ether_addr(hdr->addr1)) {
if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
rx->sta->last_seq_ctrl[rx->queue] ==
hdr->seq_ctrl)) {
- if (rx->flags & IEEE80211_RX_RA_MATCH) {
+ if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
rx->local->dot11FrameDuplicateCount++;
rx->sta->num_duplicates++;
}
@@ -796,11 +843,12 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
if (unlikely((ieee80211_is_data(hdr->frame_control) ||
ieee80211_is_pspoll(hdr->frame_control)) &&
rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ rx->sdata->vif.type != NL80211_IFTYPE_WDS &&
(!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC)))) {
if ((!ieee80211_has_fromds(hdr->frame_control) &&
!ieee80211_has_tods(hdr->frame_control) &&
ieee80211_is_data(hdr->frame_control)) ||
- !(rx->flags & IEEE80211_RX_RA_MATCH)) {
+ !(status->rx_flags & IEEE80211_RX_RA_MATCH)) {
/* Drop IBSS frames and frames for other hosts
* silently. */
return RX_DROP_MONITOR;
@@ -822,7 +870,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
int keyidx;
int hdrlen;
ieee80211_rx_result result = RX_DROP_UNUSABLE;
- struct ieee80211_key *stakey = NULL;
+ struct ieee80211_key *sta_ptk = NULL;
int mmie_keyidx = -1;
__le16 fc;
@@ -857,22 +905,25 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
* No point in finding a key and decrypting if the frame is neither
* addressed to us nor a multicast frame.
*/
- if (!(rx->flags & IEEE80211_RX_RA_MATCH))
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
return RX_CONTINUE;
/* start without a key */
rx->key = NULL;
if (rx->sta)
- stakey = rcu_dereference(rx->sta->key);
+ sta_ptk = rcu_dereference(rx->sta->ptk);
fc = hdr->frame_control;
if (!ieee80211_has_protected(fc))
mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
- if (!is_multicast_ether_addr(hdr->addr1) && stakey) {
- rx->key = stakey;
+ if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
+ rx->key = sta_ptk;
+ if ((status->flag & RX_FLAG_DECRYPTED) &&
+ (status->flag & RX_FLAG_IV_STRIPPED))
+ return RX_CONTINUE;
/* Skip decryption if the frame is not protected. */
if (!ieee80211_has_protected(fc))
return RX_CONTINUE;
@@ -885,7 +936,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (mmie_keyidx < NUM_DEFAULT_KEYS ||
mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
return RX_DROP_MONITOR; /* unexpected BIP keyidx */
- rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
+ if (rx->sta)
+ rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
+ if (!rx->key)
+ rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
} else if (!ieee80211_has_protected(fc)) {
/*
* The frame was not protected, so skip decryption. However, we
@@ -928,16 +982,25 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
keyidx = keyid >> 6;
- rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
+ /* check per-station GTK first, if multicast packet */
+ if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
+ rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
- /*
- * RSNA-protected unicast frames should always be sent with
- * pairwise or station-to-station keys, but for WEP we allow
- * using a key index as well.
- */
- if (rx->key && rx->key->conf.alg != ALG_WEP &&
- !is_multicast_ether_addr(hdr->addr1))
- rx->key = NULL;
+ /* if not found, try default key */
+ if (!rx->key) {
+ rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
+
+ /*
+ * RSNA-protected unicast frames should always be
+ * sent with pairwise or station-to-station keys,
+ * but for WEP we allow using a key index as well.
+ */
+ if (rx->key &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
+ !is_multicast_ether_addr(hdr->addr1))
+ rx->key = NULL;
+ }
}
if (rx->key) {
@@ -951,8 +1014,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
return RX_DROP_UNUSABLE;
/* the hdr variable is invalid now! */
- switch (rx->key->conf.alg) {
- case ALG_WEP:
+ switch (rx->key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
/* Check for weak IVs if possible */
if (rx->sta && ieee80211_is_data(fc) &&
(!(status->flag & RX_FLAG_IV_STRIPPED) ||
@@ -962,15 +1026,21 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
result = ieee80211_crypto_wep_decrypt(rx);
break;
- case ALG_TKIP:
+ case WLAN_CIPHER_SUITE_TKIP:
result = ieee80211_crypto_tkip_decrypt(rx);
break;
- case ALG_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP:
result = ieee80211_crypto_ccmp_decrypt(rx);
break;
- case ALG_AES_CMAC:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
result = ieee80211_crypto_aes_cmac_decrypt(rx);
break;
+ default:
+ /*
+ * We can reach here only with HW-only algorithms
+ * but why didn't it decrypt the frame?!
+ */
+ return RX_DROP_UNUSABLE;
}
/* either the frame has been decrypted or will be dropped */
@@ -1079,7 +1149,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
sta->last_rx = jiffies;
}
- if (!(rx->flags & IEEE80211_RX_RA_MATCH))
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
return RX_CONTINUE;
if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
@@ -1236,6 +1306,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
unsigned int frag, seq;
struct ieee80211_fragment_entry *entry;
struct sk_buff *skb;
+ struct ieee80211_rx_status *status;
hdr = (struct ieee80211_hdr *)rx->skb->data;
fc = hdr->frame_control;
@@ -1265,7 +1336,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
/* This is the first fragment of a new frame. */
entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
rx->queue, &(rx->skb));
- if (rx->key && rx->key->conf.alg == ALG_CCMP &&
+ if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP &&
ieee80211_has_protected(fc)) {
int queue = ieee80211_is_mgmt(fc) ?
NUM_RX_DATA_QUEUES : rx->queue;
@@ -1294,7 +1365,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
int i;
u8 pn[CCMP_PN_LEN], *rpn;
int queue;
- if (!rx->key || rx->key->conf.alg != ALG_CCMP)
+ if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP)
return RX_DROP_UNUSABLE;
memcpy(pn, entry->last_pn, CCMP_PN_LEN);
for (i = CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -1335,7 +1406,8 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
}
/* Complete frame has been reassembled - process it now */
- rx->flags |= IEEE80211_RX_FRAGMENTED;
+ status = IEEE80211_SKB_RXCB(rx->skb);
+ status->rx_flags |= IEEE80211_RX_FRAGMENTED;
out:
if (rx->sta)
@@ -1352,9 +1424,10 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
{
struct ieee80211_sub_if_data *sdata = rx->sdata;
__le16 fc = ((struct ieee80211_hdr *)rx->skb->data)->frame_control;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
if (likely(!rx->sta || !ieee80211_is_pspoll(fc) ||
- !(rx->flags & IEEE80211_RX_RA_MATCH)))
+ !(status->rx_flags & IEEE80211_RX_RA_MATCH)))
return RX_CONTINUE;
if ((sdata->vif.type != NL80211_IFTYPE_AP) &&
@@ -1492,7 +1565,7 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
* Allow EAPOL frames to us/the PAE group address regardless
* of whether the frame was encrypted or not.
*/
- if (ehdr->h_proto == htons(ETH_P_PAE) &&
+ if (ehdr->h_proto == rx->sdata->control_port_protocol &&
(compare_ether_addr(ehdr->h_dest, rx->sdata->vif.addr) == 0 ||
compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0))
return true;
@@ -1515,6 +1588,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
struct sk_buff *skb, *xmit_skb;
struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
struct sta_info *dsta;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
skb = rx->skb;
xmit_skb = NULL;
@@ -1522,7 +1596,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
if ((sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
- (rx->flags & IEEE80211_RX_RA_MATCH) &&
+ (status->rx_flags & IEEE80211_RX_RA_MATCH) &&
(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
if (is_multicast_ether_addr(ehdr->h_dest)) {
/*
@@ -1599,6 +1673,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
__le16 fc = hdr->frame_control;
struct sk_buff_head frame_list;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
if (unlikely(!ieee80211_is_data(fc)))
return RX_CONTINUE;
@@ -1606,7 +1681,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
if (unlikely(!ieee80211_is_data_present(fc)))
return RX_DROP_MONITOR;
- if (!(rx->flags & IEEE80211_RX_AMSDU))
+ if (!(status->rx_flags & IEEE80211_RX_AMSDU))
return RX_CONTINUE;
if (ieee80211_has_a4(hdr->frame_control) &&
@@ -1657,6 +1732,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
struct sk_buff *skb = rx->skb, *fwd_skb;
struct ieee80211_local *local = rx->local;
struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
hdr = (struct ieee80211_hdr *) skb->data;
hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -1702,7 +1778,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
mesh_hdr->ttl--;
- if (rx->flags & IEEE80211_RX_RA_MATCH) {
+ if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
if (!mesh_hdr->ttl)
IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh,
dropped_frames_ttl);
@@ -1909,13 +1985,38 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
}
static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ /*
+ * From here on, look only at management frames.
+ * Data and control frames are already handled,
+ * and unknown (reserved) frames are useless.
+ */
+ if (rx->skb->len < 24)
+ return RX_DROP_MONITOR;
+
+ if (!ieee80211_is_mgmt(mgmt->frame_control))
+ return RX_DROP_MONITOR;
+
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+ return RX_DROP_MONITOR;
+
+ if (ieee80211_drop_unencrypted_mgmt(rx))
+ return RX_DROP_UNUSABLE;
+
+ return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
{
struct ieee80211_local *local = rx->local;
struct ieee80211_sub_if_data *sdata = rx->sdata;
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
- struct sk_buff *nskb;
- struct ieee80211_rx_status *status;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
int len = rx->skb->len;
if (!ieee80211_is_action(mgmt->frame_control))
@@ -1928,10 +2029,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
return RX_DROP_UNUSABLE;
- if (!(rx->flags & IEEE80211_RX_RA_MATCH))
- return RX_DROP_UNUSABLE;
-
- if (ieee80211_drop_unencrypted_mgmt(rx))
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
return RX_DROP_UNUSABLE;
switch (mgmt->u.action.category) {
@@ -2024,17 +2122,36 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
goto queue;
}
+ return RX_CONTINUE;
+
invalid:
- /*
- * For AP mode, hostapd is responsible for handling any action
- * frames that we didn't handle, including returning unknown
- * ones. For all other modes we will return them to the sender,
- * setting the 0x80 bit in the action category, as required by
- * 802.11-2007 7.3.1.11.
- */
- if (sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
- return RX_DROP_MONITOR;
+ status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM;
+ /* will return in the next handlers */
+ return RX_CONTINUE;
+
+ handled:
+ if (rx->sta)
+ rx->sta->rx_packets++;
+ dev_kfree_skb(rx->skb);
+ return RX_QUEUED;
+
+ queue:
+ rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+ skb_queue_tail(&sdata->skb_queue, rx->skb);
+ ieee80211_queue_work(&local->hw, &sdata->work);
+ if (rx->sta)
+ rx->sta->rx_packets++;
+ return RX_QUEUED;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ /* skip known-bad action frames and return them in the next handler */
+ if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
+ return RX_CONTINUE;
/*
* Getting here means the kernel doesn't know how to handle
@@ -2042,12 +2159,46 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
* so userspace can register for those to know whether ones
* it transmitted were processed or returned.
*/
- status = IEEE80211_SKB_RXCB(rx->skb);
- if (cfg80211_rx_action(rx->sdata->dev, status->freq,
- rx->skb->data, rx->skb->len,
- GFP_ATOMIC))
- goto handled;
+ if (cfg80211_rx_mgmt(rx->sdata->dev, status->freq,
+ rx->skb->data, rx->skb->len,
+ GFP_ATOMIC)) {
+ if (rx->sta)
+ rx->sta->rx_packets++;
+ dev_kfree_skb(rx->skb);
+ return RX_QUEUED;
+ }
+
+
+ return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
+{
+ struct ieee80211_local *local = rx->local;
+ struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+ struct sk_buff *nskb;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+ if (!ieee80211_is_action(mgmt->frame_control))
+ return RX_CONTINUE;
+
+ /*
+ * For AP mode, hostapd is responsible for handling any action
+ * frames that we didn't handle, including returning unknown
+ * ones. For all other modes we will return them to the sender,
+ * setting the 0x80 bit in the action category, as required by
+ * 802.11-2007 7.3.1.11.
+ * Newer versions of hostapd shall also use the management frame
+ * registration mechanisms, but older ones still use cooked
+ * monitor interfaces so push all frames there.
+ */
+ if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) &&
+ (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
+ return RX_DROP_MONITOR;
/* do not return rejected action frames */
if (mgmt->u.action.category & 0x80)
@@ -2066,20 +2217,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
ieee80211_tx_skb(rx->sdata, nskb);
}
-
- handled:
- if (rx->sta)
- rx->sta->rx_packets++;
dev_kfree_skb(rx->skb);
return RX_QUEUED;
-
- queue:
- rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
- skb_queue_tail(&sdata->skb_queue, rx->skb);
- ieee80211_queue_work(&local->hw, &sdata->work);
- if (rx->sta)
- rx->sta->rx_packets++;
- return RX_QUEUED;
}
static ieee80211_rx_result debug_noinline
@@ -2090,15 +2229,6 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
__le16 stype;
- if (!(rx->flags & IEEE80211_RX_RA_MATCH))
- return RX_DROP_MONITOR;
-
- if (rx->skb->len < 24)
- return RX_DROP_MONITOR;
-
- if (ieee80211_drop_unencrypted_mgmt(rx))
- return RX_DROP_UNUSABLE;
-
rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb);
if (rxs != RX_CONTINUE)
return rxs;
@@ -2199,8 +2329,13 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
struct net_device *prev_dev = NULL;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
- if (status->flag & RX_FLAG_INTERNAL_CMTR)
+ /*
+ * If cooked monitor has been processed already, then
+ * don't do it again. If not, set the flag.
+ */
+ if (rx->flags & IEEE80211_RX_CMNTR)
goto out_free_skb;
+ rx->flags |= IEEE80211_RX_CMNTR;
if (skb_headroom(skb) < sizeof(*rthdr) &&
pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC))
@@ -2256,30 +2391,53 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
if (prev_dev) {
skb->dev = prev_dev;
netif_receive_skb(skb);
- skb = NULL;
- } else
- goto out_free_skb;
-
- status->flag |= RX_FLAG_INTERNAL_CMTR;
- return;
+ return;
+ }
out_free_skb:
dev_kfree_skb(skb);
}
+static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
+ ieee80211_rx_result res)
+{
+ switch (res) {
+ case RX_DROP_MONITOR:
+ I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
+ if (rx->sta)
+ rx->sta->rx_dropped++;
+ /* fall through */
+ case RX_CONTINUE: {
+ struct ieee80211_rate *rate = NULL;
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_rx_status *status;
+
+ status = IEEE80211_SKB_RXCB((rx->skb));
+
+ sband = rx->local->hw.wiphy->bands[status->band];
+ if (!(status->flag & RX_FLAG_HT))
+ rate = &sband->bitrates[status->rate_idx];
+
+ ieee80211_rx_cooked_monitor(rx, rate);
+ break;
+ }
+ case RX_DROP_UNUSABLE:
+ I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
+ if (rx->sta)
+ rx->sta->rx_dropped++;
+ dev_kfree_skb(rx->skb);
+ break;
+ case RX_QUEUED:
+ I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
+ break;
+ }
+}
-static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_rx_data *rx,
- struct sk_buff *skb,
- struct ieee80211_rate *rate)
+static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
+ struct sk_buff_head *frames)
{
- struct sk_buff_head reorder_release;
ieee80211_rx_result res = RX_DROP_MONITOR;
-
- __skb_queue_head_init(&reorder_release);
-
- rx->skb = skb;
- rx->sdata = sdata;
+ struct sk_buff *skb;
#define CALL_RXH(rxh) \
do { \
@@ -2288,23 +2446,14 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
goto rxh_next; \
} while (0);
- /*
- * NB: the rxh_next label works even if we jump
- * to it from here because then the list will
- * be empty, which is a trivial check
- */
- CALL_RXH(ieee80211_rx_h_passive_scan)
- CALL_RXH(ieee80211_rx_h_check)
-
- ieee80211_rx_reorder_ampdu(rx, &reorder_release);
-
- while ((skb = __skb_dequeue(&reorder_release))) {
+ while ((skb = __skb_dequeue(frames))) {
/*
* all the other fields are valid across frames
* that belong to an aMPDU since they are on the
* same TID from the same station
*/
rx->skb = skb;
+ rx->flags = 0;
CALL_RXH(ieee80211_rx_h_decrypt)
CALL_RXH(ieee80211_rx_h_check_more_data)
@@ -2316,50 +2465,92 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
CALL_RXH(ieee80211_rx_h_remove_qos_control)
CALL_RXH(ieee80211_rx_h_amsdu)
#ifdef CONFIG_MAC80211_MESH
- if (ieee80211_vif_is_mesh(&sdata->vif))
+ if (ieee80211_vif_is_mesh(&rx->sdata->vif))
CALL_RXH(ieee80211_rx_h_mesh_fwding);
#endif
CALL_RXH(ieee80211_rx_h_data)
/* special treatment -- needs the queue */
- res = ieee80211_rx_h_ctrl(rx, &reorder_release);
+ res = ieee80211_rx_h_ctrl(rx, frames);
if (res != RX_CONTINUE)
goto rxh_next;
+ CALL_RXH(ieee80211_rx_h_mgmt_check)
CALL_RXH(ieee80211_rx_h_action)
+ CALL_RXH(ieee80211_rx_h_userspace_mgmt)
+ CALL_RXH(ieee80211_rx_h_action_return)
CALL_RXH(ieee80211_rx_h_mgmt)
+ rxh_next:
+ ieee80211_rx_handlers_result(rx, res);
+
#undef CALL_RXH
+ }
+}
+
+static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
+{
+ struct sk_buff_head reorder_release;
+ ieee80211_rx_result res = RX_DROP_MONITOR;
+
+ __skb_queue_head_init(&reorder_release);
+
+#define CALL_RXH(rxh) \
+ do { \
+ res = rxh(rx); \
+ if (res != RX_CONTINUE) \
+ goto rxh_next; \
+ } while (0);
+
+ CALL_RXH(ieee80211_rx_h_passive_scan)
+ CALL_RXH(ieee80211_rx_h_check)
+
+ ieee80211_rx_reorder_ampdu(rx, &reorder_release);
+
+ ieee80211_rx_handlers(rx, &reorder_release);
+ return;
rxh_next:
- switch (res) {
- case RX_DROP_MONITOR:
- I802_DEBUG_INC(sdata->local->rx_handlers_drop);
- if (rx->sta)
- rx->sta->rx_dropped++;
- /* fall through */
- case RX_CONTINUE:
- ieee80211_rx_cooked_monitor(rx, rate);
- break;
- case RX_DROP_UNUSABLE:
- I802_DEBUG_INC(sdata->local->rx_handlers_drop);
- if (rx->sta)
- rx->sta->rx_dropped++;
- dev_kfree_skb(rx->skb);
- break;
- case RX_QUEUED:
- I802_DEBUG_INC(sdata->local->rx_handlers_queued);
- break;
- }
- }
+ ieee80211_rx_handlers_result(rx, res);
+
+#undef CALL_RXH
+}
+
+/*
+ * This function makes calls into the RX path. Therefore the
+ * caller must hold the sta_info->lock and everything has to
+ * be under rcu_read_lock protection as well.
+ */
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
+{
+ struct sk_buff_head frames;
+ struct ieee80211_rx_data rx = {
+ .sta = sta,
+ .sdata = sta->sdata,
+ .local = sta->local,
+ .queue = tid,
+ };
+ struct tid_ampdu_rx *tid_agg_rx;
+
+ tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+ if (!tid_agg_rx)
+ return;
+
+ __skb_queue_head_init(&frames);
+
+ spin_lock(&tid_agg_rx->reorder_lock);
+ ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx, &frames);
+ spin_unlock(&tid_agg_rx->reorder_lock);
+
+ ieee80211_rx_handlers(&rx, &frames);
}
/* main receive path */
-static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_rx_data *rx,
+static int prepare_for_handlers(struct ieee80211_rx_data *rx,
struct ieee80211_hdr *hdr)
{
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
struct sk_buff *skb = rx->skb;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
@@ -2373,7 +2564,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) {
if (!(sdata->dev->flags & IFF_PROMISC))
return 0;
- rx->flags &= ~IEEE80211_RX_RA_MATCH;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
}
break;
case NL80211_IFTYPE_ADHOC:
@@ -2383,15 +2574,15 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
return 1;
}
else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
- if (!(rx->flags & IEEE80211_RX_IN_SCAN))
+ if (!(status->rx_flags & IEEE80211_RX_IN_SCAN))
return 0;
- rx->flags &= ~IEEE80211_RX_RA_MATCH;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
} else if (!multicast &&
compare_ether_addr(sdata->vif.addr,
hdr->addr1) != 0) {
if (!(sdata->dev->flags & IFF_PROMISC))
return 0;
- rx->flags &= ~IEEE80211_RX_RA_MATCH;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
} else if (!rx->sta) {
int rate_idx;
if (status->flag & RX_FLAG_HT)
@@ -2409,7 +2600,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
if (!(sdata->dev->flags & IFF_PROMISC))
return 0;
- rx->flags &= ~IEEE80211_RX_RA_MATCH;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
}
break;
case NL80211_IFTYPE_AP_VLAN:
@@ -2420,9 +2611,9 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
return 0;
} else if (!ieee80211_bssid_match(bssid,
sdata->vif.addr)) {
- if (!(rx->flags & IEEE80211_RX_IN_SCAN))
+ if (!(status->rx_flags & IEEE80211_RX_IN_SCAN))
return 0;
- rx->flags &= ~IEEE80211_RX_RA_MATCH;
+ status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
}
break;
case NL80211_IFTYPE_WDS:
@@ -2431,9 +2622,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2))
return 0;
break;
- case NL80211_IFTYPE_MONITOR:
- case NL80211_IFTYPE_UNSPECIFIED:
- case __NL80211_IFTYPE_AFTER_LAST:
+ default:
/* should never get here */
WARN_ON(1);
break;
@@ -2443,12 +2632,56 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
}
/*
+ * This function returns whether or not the SKB
+ * was destined for RX processing or not, which,
+ * if consume is true, is equivalent to whether
+ * or not the skb was consumed.
+ */
+static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
+ struct sk_buff *skb, bool consume)
+{
+ struct ieee80211_local *local = rx->local;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ int prepares;
+
+ rx->skb = skb;
+ status->rx_flags |= IEEE80211_RX_RA_MATCH;
+ prepares = prepare_for_handlers(rx, hdr);
+
+ if (!prepares)
+ return false;
+
+ if (status->flag & RX_FLAG_MMIC_ERROR) {
+ if (status->rx_flags & IEEE80211_RX_RA_MATCH)
+ ieee80211_rx_michael_mic_report(hdr, rx);
+ return false;
+ }
+
+ if (!consume) {
+ skb = skb_copy(skb, GFP_ATOMIC);
+ if (!skb) {
+ if (net_ratelimit())
+ wiphy_debug(local->hw.wiphy,
+ "failed to copy multicast frame for %s\n",
+ sdata->name);
+ return true;
+ }
+
+ rx->skb = skb;
+ }
+
+ ieee80211_invoke_rx_handlers(rx);
+ return true;
+}
+
+/*
* This is the actual Rx frames handler. as it blongs to Rx path it must
* be called with rcu_read_lock protection.
*/
static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
- struct sk_buff *skb,
- struct ieee80211_rate *rate)
+ struct sk_buff *skb)
{
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
struct ieee80211_local *local = hw_to_local(hw);
@@ -2456,11 +2689,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
struct ieee80211_hdr *hdr;
__le16 fc;
struct ieee80211_rx_data rx;
- int prepares;
- struct ieee80211_sub_if_data *prev = NULL;
- struct sk_buff *skb_new;
- struct sta_info *sta, *tmp;
- bool found_sta = false;
+ struct ieee80211_sub_if_data *prev;
+ struct sta_info *sta, *tmp, *prev_sta;
int err = 0;
fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -2473,7 +2703,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning) ||
test_bit(SCAN_OFF_CHANNEL, &local->scanning)))
- rx.flags |= IEEE80211_RX_IN_SCAN;
+ status->rx_flags |= IEEE80211_RX_IN_SCAN;
if (ieee80211_is_mgmt(fc))
err = skb_linearize(skb);
@@ -2490,91 +2720,67 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
ieee80211_verify_alignment(&rx);
if (ieee80211_is_data(fc)) {
+ prev_sta = NULL;
+
for_each_sta_info(local, hdr->addr2, sta, tmp) {
- rx.sta = sta;
- found_sta = true;
- rx.sdata = sta->sdata;
-
- rx.flags |= IEEE80211_RX_RA_MATCH;
- prepares = prepare_for_handlers(rx.sdata, &rx, hdr);
- if (prepares) {
- if (status->flag & RX_FLAG_MMIC_ERROR) {
- if (rx.flags & IEEE80211_RX_RA_MATCH)
- ieee80211_rx_michael_mic_report(hdr, &rx);
- } else
- prev = rx.sdata;
- }
- }
- }
- if (!found_sta) {
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- if (!ieee80211_sdata_running(sdata))
+ if (!prev_sta) {
+ prev_sta = sta;
continue;
+ }
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
- continue;
+ rx.sta = prev_sta;
+ rx.sdata = prev_sta->sdata;
+ ieee80211_prepare_and_rx_handle(&rx, skb, false);
- /*
- * frame is destined for this interface, but if it's
- * not also for the previous one we handle that after
- * the loop to avoid copying the SKB once too much
- */
+ prev_sta = sta;
+ }
- if (!prev) {
- prev = sdata;
- continue;
- }
+ if (prev_sta) {
+ rx.sta = prev_sta;
+ rx.sdata = prev_sta->sdata;
- rx.sta = sta_info_get_bss(prev, hdr->addr2);
+ if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+ return;
+ }
+ }
- rx.flags |= IEEE80211_RX_RA_MATCH;
- prepares = prepare_for_handlers(prev, &rx, hdr);
+ prev = NULL;
- if (!prepares)
- goto next;
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (!ieee80211_sdata_running(sdata))
+ continue;
- if (status->flag & RX_FLAG_MMIC_ERROR) {
- rx.sdata = prev;
- if (rx.flags & IEEE80211_RX_RA_MATCH)
- ieee80211_rx_michael_mic_report(hdr,
- &rx);
- goto next;
- }
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ continue;
- /*
- * frame was destined for the previous interface
- * so invoke RX handlers for it
- */
+ /*
+ * frame is destined for this interface, but if it's
+ * not also for the previous one we handle that after
+ * the loop to avoid copying the SKB once too much
+ */
- skb_new = skb_copy(skb, GFP_ATOMIC);
- if (!skb_new) {
- if (net_ratelimit())
- printk(KERN_DEBUG "%s: failed to copy "
- "multicast frame for %s\n",
- wiphy_name(local->hw.wiphy),
- prev->name);
- goto next;
- }
- ieee80211_invoke_rx_handlers(prev, &rx, skb_new, rate);
-next:
+ if (!prev) {
prev = sdata;
+ continue;
}
- if (prev) {
- rx.sta = sta_info_get_bss(prev, hdr->addr2);
+ rx.sta = sta_info_get_bss(prev, hdr->addr2);
+ rx.sdata = prev;
+ ieee80211_prepare_and_rx_handle(&rx, skb, false);
- rx.flags |= IEEE80211_RX_RA_MATCH;
- prepares = prepare_for_handlers(prev, &rx, hdr);
+ prev = sdata;
+ }
- if (!prepares)
- prev = NULL;
- }
+ if (prev) {
+ rx.sta = sta_info_get_bss(prev, hdr->addr2);
+ rx.sdata = prev;
+
+ if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+ return;
}
- if (prev)
- ieee80211_invoke_rx_handlers(prev, &rx, skb, rate);
- else
- dev_kfree_skb(skb);
+
+ dev_kfree_skb(skb);
}
/*
@@ -2615,30 +2821,41 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
if (WARN_ON(!local->started))
goto drop;
- if (status->flag & RX_FLAG_HT) {
+ if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
/*
- * rate_idx is MCS index, which can be [0-76] as documented on:
- *
- * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
- *
- * Anything else would be some sort of driver or hardware error.
- * The driver should catch hardware errors.
+ * Validate the rate, unless a PLCP error means that
+ * we probably can't have a valid rate here anyway.
*/
- if (WARN((status->rate_idx < 0 ||
- status->rate_idx > 76),
- "Rate marked as an HT rate but passed "
- "status->rate_idx is not "
- "an MCS index [0-76]: %d (0x%02x)\n",
- status->rate_idx,
- status->rate_idx))
- goto drop;
- } else {
- if (WARN_ON(status->rate_idx < 0 ||
- status->rate_idx >= sband->n_bitrates))
- goto drop;
- rate = &sband->bitrates[status->rate_idx];
+
+ if (status->flag & RX_FLAG_HT) {
+ /*
+ * rate_idx is MCS index, which can be [0-76]
+ * as documented on:
+ *
+ * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
+ *
+ * Anything else would be some sort of driver or
+ * hardware error. The driver should catch hardware
+ * errors.
+ */
+ if (WARN((status->rate_idx < 0 ||
+ status->rate_idx > 76),
+ "Rate marked as an HT rate but passed "
+ "status->rate_idx is not "
+ "an MCS index [0-76]: %d (0x%02x)\n",
+ status->rate_idx,
+ status->rate_idx))
+ goto drop;
+ } else {
+ if (WARN_ON(status->rate_idx < 0 ||
+ status->rate_idx >= sband->n_bitrates))
+ goto drop;
+ rate = &sband->bitrates[status->rate_idx];
+ }
}
+ status->rx_flags = 0;
+
/*
* key references and virtual interfaces are protected using RCU
* and this requires that we are in a read-side RCU section during
@@ -2658,7 +2875,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
return;
}
- __ieee80211_rx_handle_packet(hw, skb, rate);
+ __ieee80211_rx_handle_packet(hw, skb);
rcu_read_unlock();
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 872d7b6ef6b..fb274db77e3 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -242,20 +242,19 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
local->hw_scan_req->n_channels = n_chans;
ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie,
- req->ie, req->ie_len, band);
+ req->ie, req->ie_len, band, (u32) -1,
+ 0);
local->hw_scan_req->ie_len = ielen;
return true;
}
-void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+static bool __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted,
+ bool was_hw_scan)
{
struct ieee80211_local *local = hw_to_local(hw);
- bool was_hw_scan;
-
- trace_api_scan_completed(local, aborted);
- mutex_lock(&local->scan_mtx);
+ lockdep_assert_held(&local->mtx);
/*
* It's ok to abort a not-yet-running scan (that
@@ -266,17 +265,13 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
if (WARN_ON(!local->scanning && !aborted))
aborted = true;
- if (WARN_ON(!local->scan_req)) {
- mutex_unlock(&local->scan_mtx);
- return;
- }
+ if (WARN_ON(!local->scan_req))
+ return false;
- was_hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) {
- ieee80211_queue_delayed_work(&local->hw,
- &local->scan_work, 0);
- mutex_unlock(&local->scan_mtx);
- return;
+ int rc = drv_hw_scan(local, local->scan_sdata, local->hw_scan_req);
+ if (rc == 0)
+ return false;
}
kfree(local->hw_scan_req);
@@ -290,26 +285,42 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
local->scanning = 0;
local->scan_channel = NULL;
- /* we only have to protect scan_req and hw/sw scan */
- mutex_unlock(&local->scan_mtx);
-
- ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
- if (was_hw_scan)
- goto done;
-
- ieee80211_configure_filter(local);
+ return true;
+}
- drv_sw_scan_complete(local);
+static void __ieee80211_scan_completed_finish(struct ieee80211_hw *hw,
+ bool was_hw_scan)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
- ieee80211_offchannel_return(local, true);
+ ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+ if (!was_hw_scan) {
+ ieee80211_configure_filter(local);
+ drv_sw_scan_complete(local);
+ ieee80211_offchannel_return(local, true);
+ }
- done:
+ mutex_lock(&local->mtx);
ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
+
ieee80211_mlme_notify_scan_completed(local);
ieee80211_ibss_notify_scan_completed(local);
ieee80211_mesh_notify_scan_completed(local);
ieee80211_queue_work(&local->hw, &local->work_work);
}
+
+void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ trace_api_scan_completed(local, aborted);
+
+ set_bit(SCAN_COMPLETED, &local->scanning);
+ if (aborted)
+ set_bit(SCAN_ABORTED, &local->scanning);
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+}
EXPORT_SYMBOL(ieee80211_scan_completed);
static int ieee80211_start_sw_scan(struct ieee80211_local *local)
@@ -353,6 +364,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
int rc;
+ lockdep_assert_held(&local->mtx);
+
if (local->scan_req)
return -EBUSY;
@@ -434,8 +447,8 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan)
return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME;
}
-static int ieee80211_scan_state_decision(struct ieee80211_local *local,
- unsigned long *next_delay)
+static void ieee80211_scan_state_decision(struct ieee80211_local *local,
+ unsigned long *next_delay)
{
bool associated = false;
bool tx_empty = true;
@@ -445,12 +458,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata;
struct ieee80211_channel *next_chan;
- /* if no more bands/channels left, complete scan and advance to the idle state */
- if (local->scan_channel_idx >= local->scan_req->n_channels) {
- ieee80211_scan_completed(&local->hw, false);
- return 1;
- }
-
/*
* check if at least one STA interface is associated,
* check if at least one STA interface has pending tx frames
@@ -522,7 +529,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local,
}
*next_delay = 0;
- return 0;
}
static void ieee80211_scan_state_leave_oper_channel(struct ieee80211_local *local,
@@ -638,21 +644,18 @@ void ieee80211_scan_work(struct work_struct *work)
container_of(work, struct ieee80211_local, scan_work.work);
struct ieee80211_sub_if_data *sdata = local->scan_sdata;
unsigned long next_delay = 0;
+ bool aborted, hw_scan, finish;
- mutex_lock(&local->scan_mtx);
- if (!sdata || !local->scan_req) {
- mutex_unlock(&local->scan_mtx);
- return;
- }
+ mutex_lock(&local->mtx);
- if (local->hw_scan_req) {
- int rc = drv_hw_scan(local, sdata, local->hw_scan_req);
- mutex_unlock(&local->scan_mtx);
- if (rc)
- ieee80211_scan_completed(&local->hw, true);
- return;
+ if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) {
+ aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning);
+ goto out_complete;
}
+ if (!sdata || !local->scan_req)
+ goto out;
+
if (local->scan_req && !local->scanning) {
struct cfg80211_scan_request *req = local->scan_req;
int rc;
@@ -661,21 +664,21 @@ void ieee80211_scan_work(struct work_struct *work)
local->scan_sdata = NULL;
rc = __ieee80211_start_scan(sdata, req);
- mutex_unlock(&local->scan_mtx);
-
- if (rc)
- ieee80211_scan_completed(&local->hw, true);
- return;
+ if (rc) {
+ /* need to complete scan in cfg80211 */
+ local->scan_req = req;
+ aborted = true;
+ goto out_complete;
+ } else
+ goto out;
}
- mutex_unlock(&local->scan_mtx);
-
/*
* Avoid re-scheduling when the sdata is going away.
*/
if (!ieee80211_sdata_running(sdata)) {
- ieee80211_scan_completed(&local->hw, true);
- return;
+ aborted = true;
+ goto out_complete;
}
/*
@@ -685,8 +688,12 @@ void ieee80211_scan_work(struct work_struct *work)
do {
switch (local->next_scan_state) {
case SCAN_DECISION:
- if (ieee80211_scan_state_decision(local, &next_delay))
- return;
+ /* if no more bands/channels left, complete scan */
+ if (local->scan_channel_idx >= local->scan_req->n_channels) {
+ aborted = false;
+ goto out_complete;
+ }
+ ieee80211_scan_state_decision(local, &next_delay);
break;
case SCAN_SET_CHANNEL:
ieee80211_scan_state_set_channel(local, &next_delay);
@@ -704,6 +711,19 @@ void ieee80211_scan_work(struct work_struct *work)
} while (next_delay == 0);
ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay);
+ mutex_unlock(&local->mtx);
+ return;
+
+out_complete:
+ hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
+ finish = __ieee80211_scan_completed(&local->hw, aborted, hw_scan);
+ mutex_unlock(&local->mtx);
+ if (finish)
+ __ieee80211_scan_completed_finish(&local->hw, hw_scan);
+ return;
+
+out:
+ mutex_unlock(&local->mtx);
}
int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
@@ -711,9 +731,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
{
int res;
- mutex_lock(&sdata->local->scan_mtx);
+ mutex_lock(&sdata->local->mtx);
res = __ieee80211_start_scan(sdata, req);
- mutex_unlock(&sdata->local->scan_mtx);
+ mutex_unlock(&sdata->local->mtx);
return res;
}
@@ -726,7 +746,7 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
int ret = -EBUSY;
enum ieee80211_band band;
- mutex_lock(&local->scan_mtx);
+ mutex_lock(&local->mtx);
/* busy scanning */
if (local->scan_req)
@@ -761,25 +781,44 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req);
unlock:
- mutex_unlock(&local->scan_mtx);
+ mutex_unlock(&local->mtx);
return ret;
}
+/*
+ * Only call this function when a scan can't be queued -- under RTNL.
+ */
void ieee80211_scan_cancel(struct ieee80211_local *local)
{
bool abortscan;
-
- cancel_delayed_work_sync(&local->scan_work);
+ bool finish = false;
/*
- * Only call this function when a scan can't be
- * queued -- mostly at suspend under RTNL.
+ * We are only canceling software scan, or deferred scan that was not
+ * yet really started (see __ieee80211_start_scan ).
+ *
+ * Regarding hardware scan:
+ * - we can not call __ieee80211_scan_completed() as when
+ * SCAN_HW_SCANNING bit is set this function change
+ * local->hw_scan_req to operate on 5G band, what race with
+ * driver which can use local->hw_scan_req
+ *
+ * - we can not cancel scan_work since driver can schedule it
+ * by ieee80211_scan_completed(..., true) to finish scan
+ *
+ * Hence low lever driver is responsible for canceling HW scan.
*/
- mutex_lock(&local->scan_mtx);
- abortscan = test_bit(SCAN_SW_SCANNING, &local->scanning) ||
- (!local->scanning && local->scan_req);
- mutex_unlock(&local->scan_mtx);
+ mutex_lock(&local->mtx);
+ abortscan = local->scan_req && !test_bit(SCAN_HW_SCANNING, &local->scanning);
if (abortscan)
- ieee80211_scan_completed(&local->hw, true);
+ finish = __ieee80211_scan_completed(&local->hw, true, false);
+ mutex_unlock(&local->mtx);
+
+ if (abortscan) {
+ /* The scan is canceled, but stop work from being pending */
+ cancel_delayed_work_sync(&local->scan_work);
+ }
+ if (finish)
+ __ieee80211_scan_completed_finish(&local->hw, false);
}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 6d86f0c1ad0..6d8f897d876 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -125,7 +125,7 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
lockdep_is_held(&local->sta_mtx));
while (sta) {
if ((sta->sdata == sdata ||
- sta->sdata->bss == sdata->bss) &&
+ (sta->sdata->bss && sta->sdata->bss == sdata->bss)) &&
memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
break;
sta = rcu_dereference_check(sta->hnext,
@@ -174,8 +174,7 @@ static void __sta_info_free(struct ieee80211_local *local,
}
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Destroyed STA %pM\n",
- wiphy_name(local->hw.wiphy), sta->sta.addr);
+ wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr);
#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
kfree(sta);
@@ -262,8 +261,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Allocated STA %pM\n",
- wiphy_name(local->hw.wiphy), sta->sta.addr);
+ wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr);
#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
#ifdef CONFIG_MAC80211_MESH
@@ -282,7 +280,7 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async)
unsigned long flags;
int err = 0;
- WARN_ON(!mutex_is_locked(&local->sta_mtx));
+ lockdep_assert_held(&local->sta_mtx);
/* notify driver */
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
@@ -300,8 +298,9 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async)
sta->uploaded = true;
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
if (async)
- printk(KERN_DEBUG "%s: Finished adding IBSS STA %pM\n",
- wiphy_name(local->hw.wiphy), sta->sta.addr);
+ wiphy_debug(local->hw.wiphy,
+ "Finished adding IBSS STA %pM\n",
+ sta->sta.addr);
#endif
}
@@ -411,8 +410,8 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
spin_unlock_irqrestore(&local->sta_lock, flags);
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Added IBSS STA %pM\n",
- wiphy_name(local->hw.wiphy), sta->sta.addr);
+ wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n",
+ sta->sta.addr);
#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
ieee80211_queue_work(&local->hw, &local->sta_finish_work);
@@ -459,8 +458,7 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
}
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Inserted STA %pM\n",
- wiphy_name(local->hw.wiphy), sta->sta.addr);
+ wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr);
#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
/* move reference to rcu-protected */
@@ -618,7 +616,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
struct ieee80211_sub_if_data *sdata;
struct sk_buff *skb;
unsigned long flags;
- int ret;
+ int ret, i;
might_sleep();
@@ -635,7 +633,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
* will be sufficient.
*/
set_sta_flags(sta, WLAN_STA_BLOCK_BA);
- ieee80211_sta_tear_down_BA_sessions(sta);
+ ieee80211_sta_tear_down_BA_sessions(sta, true);
spin_lock_irqsave(&local->sta_lock, flags);
ret = sta_info_hash_del(local, sta);
@@ -646,10 +644,10 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
if (ret)
return ret;
- if (sta->key) {
- ieee80211_key_free(local, sta->key);
- WARN_ON(sta->key);
- }
+ for (i = 0; i < NUM_DEFAULT_KEYS; i++)
+ ieee80211_key_free(local, sta->gtk[i]);
+ if (sta->ptk)
+ ieee80211_key_free(local, sta->ptk);
sta->dead = true;
@@ -690,8 +688,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
#endif
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Removed STA %pM\n",
- wiphy_name(local->hw.wiphy), sta->sta.addr);
+ wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr);
#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
cancel_work_sync(&sta->drv_unblock_wk);
@@ -841,13 +838,20 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
mutex_unlock(&local->sta_mtx);
}
-struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw,
- const u8 *addr)
+struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
+ const u8 *addr,
+ const u8 *localaddr)
{
struct sta_info *sta, *nxt;
- /* Just return a random station ... first in list ... */
+ /*
+ * Just return a random station if localaddr is NULL
+ * ... first in list.
+ */
for_each_sta_info(hw_to_local(hw), addr, sta, nxt) {
+ if (localaddr &&
+ compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0)
+ continue;
if (!sta->uploaded)
return NULL;
return &sta->sta;
@@ -855,7 +859,7 @@ struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw,
return NULL;
}
-EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_hw);
+EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);
struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
const u8 *addr)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 54262e72376..9265acadef3 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -79,6 +79,7 @@ enum ieee80211_sta_info_flags {
* @dialog_token: dialog token for aggregation session
* @state: session state (see above)
* @stop_initiator: initiator of a session stop
+ * @tx_stop: TX DelBA frame when stopping
*
* This structure is protected by RCU and the per-station
* spinlock. Assignments to the array holding it must hold
@@ -95,6 +96,7 @@ struct tid_ampdu_tx {
unsigned long state;
u8 dialog_token;
u8 stop_initiator;
+ bool tx_stop;
};
/**
@@ -103,6 +105,7 @@ struct tid_ampdu_tx {
* @reorder_buf: buffer to reorder incoming aggregated MPDUs
* @reorder_time: jiffies when skb was added
* @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
+ * @reorder_timer: releases expired frames from the reorder buffer.
* @head_seq_num: head sequence number in reordering buffer.
* @stored_mpdu_num: number of MPDUs in reordering buffer
* @ssn: Starting Sequence Number expected to be aggregated.
@@ -110,20 +113,25 @@ struct tid_ampdu_tx {
* @timeout: reset timer value (in TUs).
* @dialog_token: dialog token for aggregation session
* @rcu_head: RCU head used for freeing this struct
+ * @reorder_lock: serializes access to reorder buffer, see below.
*
* This structure is protected by RCU and the per-station
* spinlock. Assignments to the array holding it must hold
- * the spinlock, only the RX path can access it under RCU
- * lock-free. The RX path, since it is single-threaded,
- * can even modify the structure without locking since the
- * only other modifications to it are done when the struct
- * can not yet or no longer be found by the RX path.
+ * the spinlock.
+ *
+ * The @reorder_lock is used to protect the variables and
+ * arrays such as @reorder_buf, @reorder_time, @head_seq_num,
+ * @stored_mpdu_num and @reorder_time from being corrupted by
+ * concurrent access of the RX path and the expired frame
+ * release timer.
*/
struct tid_ampdu_rx {
struct rcu_head rcu_head;
+ spinlock_t reorder_lock;
struct sk_buff **reorder_buf;
unsigned long *reorder_time;
struct timer_list session_timer;
+ struct timer_list reorder_timer;
u16 head_seq_num;
u16 stored_mpdu_num;
u16 ssn;
@@ -191,7 +199,8 @@ enum plink_state {
* @hnext: hash table linked list pointer
* @local: pointer to the global information
* @sdata: virtual interface this station belongs to
- * @key: peer key negotiated with this station, if any
+ * @ptk: peer key negotiated with this station, if any
+ * @gtk: group keys negotiated with this station, if any
* @rate_ctrl: rate control algorithm reference
* @rate_ctrl_priv: rate control private per-STA pointer
* @last_tx_rate: rate used for last transmit, to report to userspace as
@@ -246,7 +255,8 @@ struct sta_info {
struct sta_info *hnext;
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
- struct ieee80211_key *key;
+ struct ieee80211_key *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+ struct ieee80211_key *ptk;
struct rate_control_ref *rate_ctrl;
void *rate_ctrl_priv;
spinlock_t lock;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 10caec5ea8f..3153c19893b 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -58,6 +58,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
info->control.vif = &sta->sdata->vif;
info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING |
IEEE80211_TX_INTFL_RETRANSMISSION;
+ info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
sta->tx_filtered_count++;
@@ -114,11 +115,10 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
if (net_ratelimit())
- printk(KERN_DEBUG "%s: dropped TX filtered frame, "
- "queue_len=%d PS=%d @%lu\n",
- wiphy_name(local->hw.wiphy),
- skb_queue_len(&sta->tx_filtered),
- !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies);
+ wiphy_debug(local->hw.wiphy,
+ "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n",
+ skb_queue_len(&sta->tx_filtered),
+ !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies);
#endif
dev_kfree_skb(skb);
}
@@ -176,7 +176,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
/* the HW cannot have attempted that rate */
- if (i >= hw->max_rates) {
+ if (i >= hw->max_report_rates) {
info->status.rates[i].idx = -1;
info->status.rates[i].count = 0;
} else if (info->status.rates[i].idx >= 0) {
@@ -296,7 +296,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
}
if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX)
- cfg80211_action_tx_status(
+ cfg80211_mgmt_tx_status(
skb->dev, (unsigned long) skb, skb->data, skb->len,
!!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC);
@@ -377,7 +377,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2) {
skb2->dev = prev_dev;
- netif_receive_skb(skb2);
+ netif_rx(skb2);
}
}
@@ -386,7 +386,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
}
if (prev_dev) {
skb->dev = prev_dev;
- netif_receive_skb(skb);
+ netif_rx(skb);
skb = NULL;
}
rcu_read_unlock();
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index c54db966926..96c59430950 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -273,6 +273,9 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
*/
return TX_DROP;
+ if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
+ return TX_CONTINUE;
+
if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
return TX_CONTINUE;
@@ -351,8 +354,8 @@ static void purge_old_ps_buffers(struct ieee80211_local *local)
local->total_ps_buffered = total;
#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
- printk(KERN_DEBUG "%s: PS buffers full - purged %d frames\n",
- wiphy_name(local->hw.wiphy), purged);
+ wiphy_debug(local->hw.wiphy, "PS buffers full - purged %d frames\n",
+ purged);
#endif
}
@@ -509,6 +512,18 @@ ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx)
}
static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx)
+{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+
+ if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol &&
+ tx->sdata->control_port_no_encrypt))
+ info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+
+ return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
{
struct ieee80211_key *key = NULL;
@@ -517,7 +532,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
tx->key = NULL;
- else if (tx->sta && (key = rcu_dereference(tx->sta->key)))
+ else if (tx->sta && (key = rcu_dereference(tx->sta->ptk)))
tx->key = key;
else if (ieee80211_is_mgmt(hdr->frame_control) &&
is_multicast_ether_addr(hdr->addr1) &&
@@ -527,7 +542,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
else if ((key = rcu_dereference(tx->sdata->default_key)))
tx->key = key;
else if (tx->sdata->drop_unencrypted &&
- (tx->skb->protocol != cpu_to_be16(ETH_P_PAE)) &&
+ (tx->skb->protocol != tx->sdata->control_port_protocol) &&
!(info->flags & IEEE80211_TX_CTL_INJECTED) &&
(!ieee80211_is_robust_mgmt_frame(hdr) ||
(ieee80211_is_action(hdr->frame_control) &&
@@ -543,15 +558,16 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
tx->key->tx_rx_count++;
/* TODO: add threshold stuff again */
- switch (tx->key->conf.alg) {
- case ALG_WEP:
+ switch (tx->key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
if (ieee80211_is_auth(hdr->frame_control))
break;
- case ALG_TKIP:
+ case WLAN_CIPHER_SUITE_TKIP:
if (!ieee80211_is_data_present(hdr->frame_control))
tx->key = NULL;
break;
- case ALG_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP:
if (!ieee80211_is_data_present(hdr->frame_control) &&
!ieee80211_use_mfp(hdr->frame_control, tx->sta,
tx->skb))
@@ -561,7 +577,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
IEEE80211_KEY_FLAG_SW_MGMT) &&
ieee80211_is_mgmt(hdr->frame_control);
break;
- case ALG_AES_CMAC:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
if (!ieee80211_is_mgmt(hdr->frame_control))
tx->key = NULL;
break;
@@ -946,22 +962,31 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
static ieee80211_tx_result debug_noinline
ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx)
{
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+
if (!tx->key)
return TX_CONTINUE;
- switch (tx->key->conf.alg) {
- case ALG_WEP:
+ switch (tx->key->conf.cipher) {
+ case WLAN_CIPHER_SUITE_WEP40:
+ case WLAN_CIPHER_SUITE_WEP104:
return ieee80211_crypto_wep_encrypt(tx);
- case ALG_TKIP:
+ case WLAN_CIPHER_SUITE_TKIP:
return ieee80211_crypto_tkip_encrypt(tx);
- case ALG_CCMP:
+ case WLAN_CIPHER_SUITE_CCMP:
return ieee80211_crypto_ccmp_encrypt(tx);
- case ALG_AES_CMAC:
+ case WLAN_CIPHER_SUITE_AES_CMAC:
return ieee80211_crypto_aes_cmac_encrypt(tx);
+ default:
+ /* handle hw-only algorithm */
+ if (info->control.hw_key) {
+ ieee80211_tx_set_protected(tx);
+ return TX_CONTINUE;
+ }
+ break;
+
}
- /* not reached */
- WARN_ON(1);
return TX_DROP;
}
@@ -1339,6 +1364,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
CALL_TXH(ieee80211_tx_h_dynamic_ps);
CALL_TXH(ieee80211_tx_h_check_assoc);
CALL_TXH(ieee80211_tx_h_ps_buf);
+ CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
CALL_TXH(ieee80211_tx_h_select_key);
if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
CALL_TXH(ieee80211_tx_h_rate_ctrl);
@@ -1511,8 +1537,8 @@ static int ieee80211_skb_resize(struct ieee80211_local *local,
I802_DEBUG_INC(local->tx_expand_skb_head);
if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) {
- printk(KERN_DEBUG "%s: failed to reallocate TX buffer\n",
- wiphy_name(local->hw.wiphy));
+ wiphy_debug(local->hw.wiphy,
+ "failed to reallocate TX buffer\n");
return -ENOMEM;
}
@@ -1586,6 +1612,7 @@ static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
return;
}
+ hdr = (struct ieee80211_hdr *) skb->data;
info->control.vif = &sdata->vif;
if (ieee80211_vif_is_mesh(&sdata->vif) &&
@@ -1699,7 +1726,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
u16 ethertype, hdrlen, meshhdrlen = 0;
__le16 fc;
struct ieee80211_hdr hdr;
- struct ieee80211s_hdr mesh_hdr;
+ struct ieee80211s_hdr mesh_hdr __maybe_unused;
const u8 *encaps_data;
int encaps_len, skip_header_bytes;
int nh_pos, h_pos;
@@ -1816,7 +1843,8 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
#endif
case NL80211_IFTYPE_STATION:
memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
- if (sdata->u.mgd.use_4addr && ethertype != ETH_P_PAE) {
+ if (sdata->u.mgd.use_4addr &&
+ cpu_to_be16(ethertype) != sdata->control_port_protocol) {
fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
/* RA TA DA SA */
memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
@@ -1869,7 +1897,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
if (!ieee80211_vif_is_mesh(&sdata->vif) &&
unlikely(!is_multicast_ether_addr(hdr.addr1) &&
!(sta_flags & WLAN_STA_AUTHORIZED) &&
- !(ethertype == ETH_P_PAE &&
+ !(cpu_to_be16(ethertype) == sdata->control_port_protocol &&
compare_ether_addr(sdata->vif.addr,
skb->data + ETH_ALEN) == 0))) {
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
@@ -2068,8 +2096,7 @@ void ieee80211_tx_pending(unsigned long data)
if (skb_queue_empty(&local->pending[i]))
list_for_each_entry_rcu(sdata, &local->interfaces, list)
- netif_tx_wake_queue(
- netdev_get_tx_queue(sdata->dev, i));
+ netif_wake_subqueue(sdata->dev, i);
}
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 748387d45bc..0b6fc92bc0d 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -283,8 +283,11 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
if (skb_queue_empty(&local->pending[queue])) {
rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list)
- netif_tx_wake_queue(netdev_get_tx_queue(sdata->dev, queue));
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
+ continue;
+ netif_wake_subqueue(sdata->dev, queue);
+ }
rcu_read_unlock();
} else
tasklet_schedule(&local->tx_pending_tasklet);
@@ -323,7 +326,7 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
rcu_read_lock();
list_for_each_entry_rcu(sdata, &local->interfaces, list)
- netif_tx_stop_queue(netdev_get_tx_queue(sdata->dev, queue));
+ netif_stop_subqueue(sdata->dev, queue);
rcu_read_unlock();
}
@@ -471,16 +474,10 @@ void ieee80211_iterate_active_interfaces(
list_for_each_entry(sdata, &local->interfaces, list) {
switch (sdata->vif.type) {
- case __NL80211_IFTYPE_AFTER_LAST:
- case NL80211_IFTYPE_UNSPECIFIED:
case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_AP_VLAN:
continue;
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_WDS:
- case NL80211_IFTYPE_MESH_POINT:
+ default:
break;
}
if (ieee80211_sdata_running(sdata))
@@ -505,16 +502,10 @@ void ieee80211_iterate_active_interfaces_atomic(
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
switch (sdata->vif.type) {
- case __NL80211_IFTYPE_AFTER_LAST:
- case NL80211_IFTYPE_UNSPECIFIED:
case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_AP_VLAN:
continue;
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_WDS:
- case NL80211_IFTYPE_MESH_POINT:
+ default:
break;
}
if (ieee80211_sdata_running(sdata))
@@ -904,26 +895,34 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
const u8 *ie, size_t ie_len,
- enum ieee80211_band band)
+ enum ieee80211_band band, u32 rate_mask,
+ u8 channel)
{
struct ieee80211_supported_band *sband;
u8 *pos;
size_t offset = 0, noffset;
int supp_rates_len, i;
+ u8 rates[32];
+ int num_rates;
+ int ext_rates_len;
sband = local->hw.wiphy->bands[band];
pos = buffer;
- supp_rates_len = min_t(int, sband->n_bitrates, 8);
+ num_rates = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if ((BIT(i) & rate_mask) == 0)
+ continue; /* skip rate */
+ rates[num_rates++] = (u8) (sband->bitrates[i].bitrate / 5);
+ }
+
+ supp_rates_len = min_t(int, num_rates, 8);
*pos++ = WLAN_EID_SUPP_RATES;
*pos++ = supp_rates_len;
-
- for (i = 0; i < supp_rates_len; i++) {
- int rate = sband->bitrates[i].bitrate;
- *pos++ = (u8) (rate / 5);
- }
+ memcpy(pos, rates, supp_rates_len);
+ pos += supp_rates_len;
/* insert "request information" if in custom IEs */
if (ie && ie_len) {
@@ -941,14 +940,18 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
offset = noffset;
}
- if (sband->n_bitrates > i) {
+ ext_rates_len = num_rates - supp_rates_len;
+ if (ext_rates_len > 0) {
*pos++ = WLAN_EID_EXT_SUPP_RATES;
- *pos++ = sband->n_bitrates - i;
+ *pos++ = ext_rates_len;
+ memcpy(pos, rates + supp_rates_len, ext_rates_len);
+ pos += ext_rates_len;
+ }
- for (; i < sband->n_bitrates; i++) {
- int rate = sband->bitrates[i].bitrate;
- *pos++ = (u8) (rate / 5);
- }
+ if (channel && sband->band == IEEE80211_BAND_2GHZ) {
+ *pos++ = WLAN_EID_DS_PARAMS;
+ *pos++ = 1;
+ *pos++ = channel;
}
/* insert custom IEs that go before HT */
@@ -1017,6 +1020,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
struct ieee80211_mgmt *mgmt;
size_t buf_len;
u8 *buf;
+ u8 chan;
/* FIXME: come up with a proper value */
buf = kmalloc(200 + ie_len, GFP_KERNEL);
@@ -1026,8 +1030,14 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
return;
}
+ chan = ieee80211_frequency_to_channel(
+ local->hw.conf.channel->center_freq);
+
buf_len = ieee80211_build_preq_ies(local, buf, ie, ie_len,
- local->hw.conf.channel->band);
+ local->hw.conf.channel->band,
+ sdata->rc_rateidx_mask
+ [local->hw.conf.channel->band],
+ chan);
skb = ieee80211_probereq_get(&local->hw, &sdata->vif,
ssid, ssid_len,
@@ -1189,7 +1199,9 @@ int ieee80211_reconfig(struct ieee80211_local *local)
/* ignore virtual */
break;
case NL80211_IFTYPE_UNSPECIFIED:
- case __NL80211_IFTYPE_AFTER_LAST:
+ case NUM_NL80211_IFTYPES:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ case NL80211_IFTYPE_P2P_GO:
WARN_ON(1);
break;
}
@@ -1209,7 +1221,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
mutex_lock(&local->sta_mtx);
list_for_each_entry(sta, &local->sta_list, list) {
- ieee80211_sta_tear_down_BA_sessions(sta);
+ ieee80211_sta_tear_down_BA_sessions(sta, true);
clear_sta_flags(sta, WLAN_STA_BLOCK_BA);
}
@@ -1285,17 +1297,13 @@ static int check_mgd_smps(struct ieee80211_if_managed *ifmgd,
}
/* must hold iflist_mtx */
-void ieee80211_recalc_smps(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *forsdata)
+void ieee80211_recalc_smps(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF;
int count = 0;
- if (forsdata)
- WARN_ON(!mutex_is_locked(&forsdata->u.mgd.mtx));
-
- WARN_ON(!mutex_is_locked(&local->iflist_mtx));
+ lockdep_assert_held(&local->iflist_mtx);
/*
* This function could be improved to handle multiple
@@ -1308,22 +1316,12 @@ void ieee80211_recalc_smps(struct ieee80211_local *local,
*/
list_for_each_entry(sdata, &local->interfaces, list) {
- if (!netif_running(sdata->dev))
+ if (!ieee80211_sdata_running(sdata))
continue;
if (sdata->vif.type != NL80211_IFTYPE_STATION)
goto set;
- if (sdata != forsdata) {
- /*
- * This nested is ok -- we are holding the iflist_mtx
- * so can't get here twice or so. But it's required
- * since normally we acquire it first and then the
- * iflist_mtx.
- */
- mutex_lock_nested(&sdata->u.mgd.mtx, SINGLE_DEPTH_NESTING);
- count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
- mutex_unlock(&sdata->u.mgd.mtx);
- } else
- count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
+
+ count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
if (count > 1) {
smps_mode = IEEE80211_SMPS_OFF;
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 9ebc8d8a1f5..2ff6d1e3ed2 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -222,7 +222,7 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
struct ieee80211_key *key)
{
u32 klen;
- u8 *rc4key;
+ u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
u8 keyidx;
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
unsigned int hdrlen;
@@ -240,15 +240,11 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
keyidx = skb->data[hdrlen + 3] >> 6;
- if (!key || keyidx != key->conf.keyidx || key->conf.alg != ALG_WEP)
+ if (!key || keyidx != key->conf.keyidx)
return -1;
klen = 3 + key->conf.keylen;
- rc4key = kmalloc(klen, GFP_ATOMIC);
- if (!rc4key)
- return -1;
-
/* Prepend 24-bit IV to RC4 key */
memcpy(rc4key, skb->data + hdrlen, 3);
@@ -260,8 +256,6 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
len))
ret = -1;
- kfree(rc4key);
-
/* Trim ICV */
skb_trim(skb, skb->len - WEP_ICV_LEN);
diff --git a/net/mac80211/work.c b/net/mac80211/work.c
index 81d4ad64184..ae344d1ba05 100644
--- a/net/mac80211/work.c
+++ b/net/mac80211/work.c
@@ -43,7 +43,7 @@ enum work_action {
/* utils */
static inline void ASSERT_WORK_MTX(struct ieee80211_local *local)
{
- WARN_ON(!mutex_is_locked(&local->work_mtx));
+ lockdep_assert_held(&local->mtx);
}
/*
@@ -757,7 +757,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
mgmt = (struct ieee80211_mgmt *) skb->data;
fc = le16_to_cpu(mgmt->frame_control);
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_for_each_entry(wk, &local->work_list, list) {
const u8 *bssid = NULL;
@@ -833,7 +833,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
WARN(1, "unexpected: %d", rma);
}
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
if (rma != WORK_ACT_DONE)
goto out;
@@ -845,9 +845,9 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
case WORK_DONE_REQUEUE:
synchronize_rcu();
wk->started = false; /* restart */
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_add_tail(&wk->list, &local->work_list);
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
}
out:
@@ -888,9 +888,9 @@ static void ieee80211_work_work(struct work_struct *work)
while ((skb = skb_dequeue(&local->work_skb_queue)))
ieee80211_work_rx_queued_mgmt(local, skb);
- ieee80211_recalc_idle(local);
+ mutex_lock(&local->mtx);
- mutex_lock(&local->work_mtx);
+ ieee80211_recalc_idle(local);
list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
bool started = wk->started;
@@ -995,20 +995,16 @@ static void ieee80211_work_work(struct work_struct *work)
run_again(local, jiffies + HZ/2);
}
- mutex_lock(&local->scan_mtx);
-
if (list_empty(&local->work_list) && local->scan_req &&
!local->scanning)
ieee80211_queue_delayed_work(&local->hw,
&local->scan_work,
round_jiffies_relative(0));
- mutex_unlock(&local->scan_mtx);
-
- mutex_unlock(&local->work_mtx);
-
ieee80211_recalc_idle(local);
+ mutex_unlock(&local->mtx);
+
list_for_each_entry_safe(wk, tmp, &free_work, list) {
wk->done(wk, NULL);
list_del(&wk->list);
@@ -1035,16 +1031,15 @@ void ieee80211_add_work(struct ieee80211_work *wk)
wk->started = false;
local = wk->sdata->local;
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_add_tail(&wk->list, &local->work_list);
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
ieee80211_queue_work(&local->hw, &local->work_work);
}
void ieee80211_work_init(struct ieee80211_local *local)
{
- mutex_init(&local->work_mtx);
INIT_LIST_HEAD(&local->work_list);
setup_timer(&local->work_timer, ieee80211_work_timer,
(unsigned long)local);
@@ -1057,7 +1052,7 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata)
struct ieee80211_local *local = sdata->local;
struct ieee80211_work *wk;
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_for_each_entry(wk, &local->work_list, list) {
if (wk->sdata != sdata)
continue;
@@ -1065,19 +1060,19 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata)
wk->started = true;
wk->timeout = jiffies;
}
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
/* run cleanups etc. */
ieee80211_work_work(&local->work_work);
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_for_each_entry(wk, &local->work_list, list) {
if (wk->sdata != sdata)
continue;
WARN_ON(1);
break;
}
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
}
ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata,
@@ -1163,7 +1158,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata,
struct ieee80211_work *wk, *tmp;
bool found = false;
- mutex_lock(&local->work_mtx);
+ mutex_lock(&local->mtx);
list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
if ((unsigned long) wk == cookie) {
wk->timeout = jiffies;
@@ -1171,7 +1166,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata,
break;
}
}
- mutex_unlock(&local->work_mtx);
+ mutex_unlock(&local->mtx);
if (!found)
return -ENOENT;
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 8d59d27d887..bee230d8fd1 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -36,8 +36,8 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
int tail;
hdr = (struct ieee80211_hdr *)skb->data;
- if (!tx->key || tx->key->conf.alg != ALG_TKIP || skb->len < 24 ||
- !ieee80211_is_data_present(hdr->frame_control))
+ if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
+ skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control))
return TX_CONTINUE;
hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -94,7 +94,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
if (status->flag & RX_FLAG_MMIC_STRIPPED)
return RX_CONTINUE;
- if (!rx->key || rx->key->conf.alg != ALG_TKIP ||
+ if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
!ieee80211_has_protected(hdr->frame_control) ||
!ieee80211_is_data_present(hdr->frame_control))
return RX_CONTINUE;
@@ -117,7 +117,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
key = &rx->key->conf.key[key_offset];
michael_mic(key, hdr, data, data_len, mic);
if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0 || wpa_test) {
- if (!(rx->flags & IEEE80211_RX_RA_MATCH))
+ if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
return RX_DROP_UNUSABLE;
mac80211_ev_michael_mic_failure(rx->sdata, rx->key->conf.keyidx,
@@ -221,19 +221,13 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
if (!rx->sta || skb->len - hdrlen < 12)
return RX_DROP_UNUSABLE;
- if (status->flag & RX_FLAG_DECRYPTED) {
- if (status->flag & RX_FLAG_IV_STRIPPED) {
- /*
- * Hardware took care of all processing, including
- * replay protection, and stripped the ICV/IV so
- * we cannot do any checks here.
- */
- return RX_CONTINUE;
- }
-
- /* let TKIP code verify IV, but skip decryption */
+ /*
+ * Let TKIP code verify IV, but skip decryption.
+ * In the case where hardware checks the IV as well,
+ * we don't even get here, see ieee80211_rx_h_decrypt()
+ */
+ if (status->flag & RX_FLAG_DECRYPTED)
hwaccel = 1;
- }
res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm,
key, skb->data + hdrlen,
@@ -447,10 +441,6 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx)
if (!rx->sta || data_len < 0)
return RX_DROP_UNUSABLE;
- if ((status->flag & RX_FLAG_DECRYPTED) &&
- (status->flag & RX_FLAG_IV_STRIPPED))
- return RX_CONTINUE;
-
ccmp_hdr2pn(pn, skb->data + hdrlen);
queue = ieee80211_is_mgmt(hdr->frame_control) ?
@@ -564,10 +554,6 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
if (!ieee80211_is_mgmt(hdr->frame_control))
return RX_CONTINUE;
- if ((status->flag & RX_FLAG_DECRYPTED) &&
- (status->flag & RX_FLAG_IV_STRIPPED))
- return RX_CONTINUE;
-
if (skb->len < 24 + sizeof(*mmie))
return RX_DROP_UNUSABLE;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d33bf..85dabb86be6 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
static DEFINE_MUTEX(afinfo_mutex);
-const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
EXPORT_SYMBOL(nf_afinfo);
int nf_register_afinfo(const struct nf_afinfo *afinfo)
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks);
void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
{
- unsigned int i;
-
- for (i = 0; i < n; i++)
- nf_unregister_hook(&reg[i]);
+ while (n-- > 0)
+ nf_unregister_hook(&reg[n]);
}
EXPORT_SYMBOL(nf_unregister_hooks);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 46a77d5c388..a22dac22705 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -3,7 +3,7 @@
#
menuconfig IP_VS
tristate "IP virtual server support"
- depends on NET && INET && NETFILTER && NF_CONNTRACK
+ depends on NET && INET && NETFILTER
---help---
IP Virtual Server support will let you build a high-performance
virtual server based on cluster of two or more real servers. This
@@ -235,7 +235,8 @@ comment 'IPVS application helper'
config IP_VS_FTP
tristate "FTP protocol helper"
- depends on IP_VS_PROTO_TCP && NF_NAT
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+ select IP_VS_NFCT
---help---
FTP is a protocol that transfers IP address and/or port number in
the payload. In the virtual server via Network Address Translation,
@@ -247,4 +248,19 @@ config IP_VS_FTP
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_NFCT
+ bool "Netfilter connection tracking"
+ depends on NF_CONNTRACK
+ ---help---
+ The Netfilter connection tracking support allows the IPVS
+ connection state to be exported to the Netfilter framework
+ for filtering purposes.
+
+config IP_VS_PE_SIP
+ tristate "SIP persistence engine"
+ depends on IP_VS_PROTO_UDP
+ depends on NF_CONNTRACK_SIP
+ ---help---
+ Allow persistence based on the SIP Call-ID
+
endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index e3baefd7066..34ee602ddb6 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
- ip_vs_est.o ip_vs_proto.o \
- $(ip_vs_proto-objs-y)
+ ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
+ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
# IPVS core
@@ -32,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index e76f87f4aca..a475edee091 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
goto out;
list_add(&inc->a_list, &app->incs_list);
- IP_VS_DBG(9, "%s application %s:%u registered\n",
- pp->name, inc->name, inc->port);
+ IP_VS_DBG(9, "%s App %s:%u registered\n",
+ pp->name, inc->name, ntohs(inc->port));
return 0;
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
pp->unregister_app(inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
- pp->name, inc->name, inc->port);
+ pp->name, inc->name, ntohs(inc->port));
list_del(&inc->a_list);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index b71c69a2db1..e9adecdc8ca 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
& ip_vs_conn_tab_mask;
}
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+ bool inverse)
+{
+ const union nf_inet_addr *addr;
+ __be16 port;
+
+ if (p->pe_data && p->pe->hashkey_raw)
+ return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+ ip_vs_conn_tab_mask;
+
+ if (likely(!inverse)) {
+ addr = p->caddr;
+ port = p->cport;
+ } else {
+ addr = p->vaddr;
+ port = p->vport;
+ }
+
+ return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+ NULL, 0, &p);
+
+ if (cp->dest && cp->dest->svc->pe) {
+ p.pe = cp->dest->svc->pe;
+ p.pe_data = cp->pe_data;
+ p.pe_data_len = cp->pe_data_len;
+ }
+
+ return ip_vs_conn_hashkey_param(&p, false);
+}
/*
* Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
return 0;
/* Hash by protocol, client address and port */
- hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+ hash = ip_vs_conn_hashkey_conn(cp);
ct_write_lock(hash);
spin_lock(&cp->lock);
@@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
int ret;
/* unhash it and decrease its reference counter */
- hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+ hash = ip_vs_conn_hashkey_conn(cp);
ct_write_lock(hash);
spin_lock(&cp->lock);
@@ -218,27 +254,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
* Called for pkts coming from OUTside-to-INside.
- * s_addr, s_port: pkt source address (foreign host)
- * d_addr, d_port: pkt dest address (load balancer)
+ * p->caddr, p->cport: pkt source address (foreign host)
+ * p->vaddr, p->vport: pkt dest address (load balancer)
*/
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
- hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+ hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == af &&
- ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
- ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
- s_port == cp->cport && d_port == cp->vport &&
- ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
- protocol == cp->protocol) {
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+ p->cport == cp->cport && p->vport == cp->vport &&
+ ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+ p->protocol == cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
@@ -251,99 +286,111 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
return NULL;
}
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
struct ip_vs_conn *cp;
- cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
- if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
- cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
- d_port);
+ cp = __ip_vs_conn_in_get(p);
+ if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+ struct ip_vs_conn_param cport_zero_p = *p;
+ cport_zero_p.cport = 0;
+ cp = __ip_vs_conn_in_get(&cport_zero_p);
+ }
IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
- ip_vs_proto_name(protocol),
- IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
- IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
cp ? "hit" : "not hit");
return cp;
}
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ unsigned int proto_off, int inverse,
+ struct ip_vs_conn_param *p)
+{
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return 1;
+
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+ &iph->daddr, pptr[1], p);
+ else
+ ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
+ &iph->saddr, pptr[0], p);
+ return 0;
+}
+
struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
- __be16 _ports[2], *pptr;
+ struct ip_vs_conn_param p;
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
return NULL;
- if (likely(!inverse))
- return ip_vs_conn_in_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- else
- return ip_vs_conn_in_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
+ return ip_vs_conn_in_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
/* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp;
- hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+ hash = ip_vs_conn_hashkey_param(p, false);
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == af &&
- ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+ if (p->pe_data && p->pe->ct_match) {
+ if (p->pe->ct_match(p, cp))
+ goto out;
+ continue;
+ }
+
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
/* protocol should only be IPPROTO_IP if
- * d_addr is a fwmark */
- ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
- d_addr, &cp->vaddr) &&
- s_port == cp->cport && d_port == cp->vport &&
+ * p->vaddr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+ p->af, p->vaddr, &cp->vaddr) &&
+ p->cport == cp->cport && p->vport == cp->vport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- protocol == cp->protocol) {
- /* HIT */
- atomic_inc(&cp->refcnt);
+ p->protocol == cp->protocol)
goto out;
- }
}
cp = NULL;
out:
+ if (cp)
+ atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
- ip_vs_proto_name(protocol),
- IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
- IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
cp ? "hit" : "not hit");
return cp;
}
-/*
- * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- * Called for pkts coming from inside-to-OUTside.
- * s_addr, s_port: pkt source address (inside host)
- * d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ * p->caddr, p->cport: pkt source address (inside host)
+ * p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned hash;
struct ip_vs_conn *cp, *ret=NULL;
@@ -351,16 +398,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
/*
* Check for "full" addressed entries
*/
- hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+ hash = ip_vs_conn_hashkey_param(p, true);
ct_read_lock(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == af &&
- ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
- ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
- d_port == cp->cport && s_port == cp->dport &&
- protocol == cp->protocol) {
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ p->vport == cp->cport && p->cport == cp->dport &&
+ p->protocol == cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ret = cp;
@@ -371,9 +418,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
ct_read_unlock(hash);
IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
- ip_vs_proto_name(protocol),
- IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
- IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
ret ? "hit" : "not hit");
return ret;
@@ -385,20 +432,12 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
unsigned int proto_off, int inverse)
{
- __be16 _ports[2], *pptr;
+ struct ip_vs_conn_param p;
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
- if (pptr == NULL)
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
return NULL;
- if (likely(!inverse))
- return ip_vs_conn_out_get(af, iph->protocol,
- &iph->saddr, pptr[0],
- &iph->daddr, pptr[1]);
- else
- return ip_vs_conn_out_get(af, iph->protocol,
- &iph->daddr, pptr[1],
- &iph->saddr, pptr[0]);
+ return ip_vs_conn_out_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
@@ -505,6 +544,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
+ unsigned int conn_flags;
+
/* if dest is NULL, then return directly */
if (!dest)
return;
@@ -512,16 +553,20 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
/* Increase the refcnt counter of the dest */
atomic_inc(&dest->refcnt);
+ conn_flags = atomic_read(&dest->conn_flags);
+ if (cp->protocol != IPPROTO_UDP)
+ conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
/* Bind with the destination and its corresponding transmitter */
- if ((cp->flags & IP_VS_CONN_F_SYNC) &&
- (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+ if (cp->flags & IP_VS_CONN_F_SYNC) {
/* if the connection is not template and is created
* by sync, preserve the activity flag.
*/
- cp->flags |= atomic_read(&dest->conn_flags) &
- (~IP_VS_CONN_F_INACTIVE);
- else
- cp->flags |= atomic_read(&dest->conn_flags);
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* connections inherit forwarding method from dest */
+ cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+ }
+ cp->flags |= conn_flags;
cp->dest = dest;
IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -717,6 +762,10 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control)
ip_vs_control_del(cp);
+ if (cp->flags & IP_VS_CONN_F_NFCT)
+ ip_vs_conn_drop_conntrack(cp);
+
+ kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
@@ -751,13 +800,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
* Create a new connection entry and hash it into the ip_vs_conn_tab
*/
struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
- const union nf_inet_addr *vaddr, __be16 vport,
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
struct ip_vs_dest *dest)
{
struct ip_vs_conn *cp;
- struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+ struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
@@ -767,17 +815,21 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
INIT_LIST_HEAD(&cp->c_list);
setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
- cp->af = af;
- cp->protocol = proto;
- ip_vs_addr_copy(af, &cp->caddr, caddr);
- cp->cport = cport;
- ip_vs_addr_copy(af, &cp->vaddr, vaddr);
- cp->vport = vport;
+ cp->af = p->af;
+ cp->protocol = p->protocol;
+ ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+ cp->cport = p->cport;
+ ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+ cp->vport = p->vport;
/* proto should only be IPPROTO_IP if d_addr is a fwmark */
- ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
+ ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
&cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ cp->pe_data = p->pe_data;
+ cp->pe_data_len = p->pe_data_len;
+ }
spin_lock_init(&cp->lock);
/*
@@ -803,7 +855,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6)
+ if (p->af == AF_INET6)
ip_vs_bind_xmit_v6(cp);
else
#endif
@@ -812,13 +864,22 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
if (unlikely(pp && atomic_read(&pp->appcnt)))
ip_vs_bind_app(cp, pp);
+ /*
+ * Allow conntrack to be preserved. By default, conntrack
+ * is created and destroyed for every packet.
+ * Sometimes keeping conntrack can be useful for
+ * IP_VS_CONN_F_ONE_PACKET too.
+ */
+
+ if (ip_vs_conntrack_enabled())
+ cp->flags |= IP_VS_CONN_F_NFCT;
+
/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);
return cp;
}
-
/*
* /proc/net/ip_vs_conn entries
*/
@@ -834,7 +895,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
if (pos-- == 0) {
seq->private = &ip_vs_conn_tab[idx];
- return cp;
+ return cp;
}
}
ct_read_unlock_bh(idx);
@@ -891,30 +952,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
seq_puts(seq,
- "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
else {
const struct ip_vs_conn *cp = v;
+ char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+ size_t len = 0;
+
+ if (cp->dest && cp->pe_data &&
+ cp->dest->svc->pe->show_pe_data) {
+ pe_data[0] = ' ';
+ len = strlen(cp->dest->svc->pe->name);
+ memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ pe_data[len + 1] = ' ';
+ len += 2;
+ len += cp->dest->svc->pe->show_pe_data(cp,
+ pe_data + len);
+ }
+ pe_data[len] = '\0';
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
- seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%pI6 %04X %-11s %7lu%s\n",
ip_vs_proto_name(cp->protocol),
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
&cp->daddr.in6, ntohs(cp->dport),
ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ);
+ (cp->timer.expires-jiffies)/HZ, pe_data);
else
#endif
seq_printf(seq,
"%-3s %08X %04X %08X %04X"
- " %08X %04X %-11s %7lu\n",
+ " %08X %04X %-11s %7lu%s\n",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
ntohl(cp->daddr.ip), ntohs(cp->dport),
ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ);
+ (cp->timer.expires-jiffies)/HZ, pe_data);
}
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 4f8ddba4801..b4e51e9c5a0 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -40,6 +40,7 @@
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h>
+#include <net/ip6_checksum.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -47,6 +48,7 @@
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
#endif
#include <net/ip_vs.h>
@@ -175,6 +177,18 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
return pp->state_transition(cp, direction, skb, pp);
}
+static inline void
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+ struct sk_buff *skb, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+ p->pe = svc->pe;
+ if (p->pe && p->pe->fill_param)
+ p->pe->fill_param(p, skb);
+}
/*
* IPVS persistent scheduling function
@@ -185,15 +199,16 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
*/
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
__be16 ports[2])
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
struct ip_vs_conn *ct;
- __be16 dport; /* destination port to forward */
- __be16 flags;
+ __be16 dport = 0; /* destination port to forward */
+ unsigned int flags;
+ struct ip_vs_conn_param param;
union nf_inet_addr snet; /* source network of the client,
after masking */
@@ -226,120 +241,75 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
* is created for other persistent services.
*/
- if (ports[1] == svc->port) {
- /* Check if a template already exists */
- if (svc->port != FTPPORT)
- ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
- &iph.daddr, ports[1]);
- else
- ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
- &iph.daddr, 0);
-
- if (!ct || !ip_vs_check_template(ct)) {
- /*
- * No template found or the dest of the connection
- * template is not available.
- */
- dest = svc->scheduler->schedule(svc, skb);
- if (dest == NULL) {
- IP_VS_DBG(1, "p-schedule: no dest found.\n");
- return NULL;
- }
-
- /*
- * Create a template like <protocol,caddr,0,
- * vaddr,vport,daddr,dport> for non-ftp service,
- * and <protocol,caddr,0,vaddr,0,daddr,0>
- * for ftp service.
+ {
+ int protocol = iph.protocol;
+ const union nf_inet_addr *vaddr = &iph.daddr;
+ const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+ __be16 vport = 0;
+
+ if (ports[1] == svc->port) {
+ /* non-FTP template:
+ * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+ * FTP template:
+ * <protocol, caddr, 0, vaddr, 0, daddr, 0>
*/
if (svc->port != FTPPORT)
- ct = ip_vs_conn_new(svc->af, iph.protocol,
- &snet, 0,
- &iph.daddr,
- ports[1],
- &dest->addr, dest->port,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- else
- ct = ip_vs_conn_new(svc->af, iph.protocol,
- &snet, 0,
- &iph.daddr, 0,
- &dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- if (ct == NULL)
- return NULL;
-
- ct->timeout = svc->timeout;
+ vport = ports[1];
} else {
- /* set destination with the found template */
- dest = ct->dest;
- }
- dport = dest->port;
- } else {
- /*
- * Note: persistent fwmark-based services and persistent
- * port zero service are handled here.
- * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
- * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
- */
- if (svc->fwmark) {
- union nf_inet_addr fwmark = {
- .ip = htonl(svc->fwmark)
- };
-
- ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
- &fwmark, 0);
- } else
- ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
- &iph.daddr, 0);
-
- if (!ct || !ip_vs_check_template(ct)) {
- /*
- * If it is not persistent port zero, return NULL,
- * otherwise create a connection template.
+ /* Note: persistent fwmark-based services and
+ * persistent port zero service are handled here.
+ * fwmark template:
+ * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+ * port zero template:
+ * <protocol,caddr,0,vaddr,0,daddr,0>
*/
- if (svc->port)
- return NULL;
-
- dest = svc->scheduler->schedule(svc, skb);
- if (dest == NULL) {
- IP_VS_DBG(1, "p-schedule: no dest found.\n");
- return NULL;
+ if (svc->fwmark) {
+ protocol = IPPROTO_IP;
+ vaddr = &fwmark;
}
+ }
+ ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param);
+ }
- /*
- * Create a template according to the service
- */
- if (svc->fwmark) {
- union nf_inet_addr fwmark = {
- .ip = htonl(svc->fwmark)
- };
-
- ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
- &snet, 0,
- &fwmark, 0,
- &dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- } else
- ct = ip_vs_conn_new(svc->af, iph.protocol,
- &snet, 0,
- &iph.daddr, 0,
- &dest->addr, 0,
- IP_VS_CONN_F_TEMPLATE,
- dest);
- if (ct == NULL)
- return NULL;
-
- ct->timeout = svc->timeout;
- } else {
- /* set destination with the found template */
- dest = ct->dest;
+ /* Check if a template already exists */
+ ct = ip_vs_ct_in_get(&param);
+ if (!ct || !ip_vs_check_template(ct)) {
+ /* No template found or the dest of the connection
+ * template is not available.
+ */
+ dest = svc->scheduler->schedule(svc, skb);
+ if (!dest) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ kfree(param.pe_data);
+ return NULL;
}
- dport = ports[1];
+
+ if (ports[1] == svc->port && svc->port != FTPPORT)
+ dport = dest->port;
+
+ /* Create a template
+ * This adds param.pe_data to the template,
+ * and thus param.pe_data will be destroyed
+ * when the template expires */
+ ct = ip_vs_conn_new(&param, &dest->addr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest);
+ if (ct == NULL) {
+ kfree(param.pe_data);
+ return NULL;
+ }
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ kfree(param.pe_data);
}
+ dport = ports[1];
+ if (dport == svc->port && dest->port)
+ dport = dest->port;
+
flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
&& iph.protocol == IPPROTO_UDP)?
IP_VS_CONN_F_ONE_PACKET : 0;
@@ -347,12 +317,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/*
* Create a new connection according to the template
*/
- cp = ip_vs_conn_new(svc->af, iph.protocol,
- &iph.saddr, ports[0],
- &iph.daddr, ports[1],
- &dest->addr, dport,
- flags,
- dest);
+ ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
+ &iph.daddr, ports[1], &param);
+ cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
if (cp == NULL) {
ip_vs_conn_put(ct);
return NULL;
@@ -376,23 +343,53 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
* Protocols supported: TCP, UDP
*/
struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_protocol *pp, int *ignored)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
struct ip_vs_dest *dest;
- __be16 _ports[2], *pptr, flags;
+ __be16 _ports[2], *pptr;
+ unsigned int flags;
+ *ignored = 1;
ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
if (pptr == NULL)
return NULL;
/*
+ * FTPDATA needs this check when using local real server.
+ * Never schedule Active FTPDATA connections from real server.
+ * For LVS-NAT they must be already created. For other methods
+ * with persistence the connection is created on SYN+ACK.
+ */
+ if (pptr[0] == FTPDATA) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling FTPDATA");
+ return NULL;
+ }
+
+ /*
+ * Do not schedule replies from local real server. It is risky
+ * for fwmark services but mostly for persistent services.
+ */
+ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+ (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling reply for existing connection");
+ __ip_vs_conn_put(cp);
+ return NULL;
+ }
+
+ /*
* Persistent service
*/
- if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+ *ignored = 0;
return ip_vs_sched_persist(svc, skb, pptr);
+ }
/*
* Non-persistent service
@@ -405,6 +402,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
return NULL;
}
+ *ignored = 0;
+
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -418,14 +417,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/*
* Create a connection entry.
*/
- cp = ip_vs_conn_new(svc->af, iph.protocol,
- &iph.saddr, pptr[0],
- &iph.daddr, pptr[1],
- &dest->addr, dest->port ? dest->port : pptr[1],
- flags,
- dest);
- if (cp == NULL)
- return NULL;
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+ pptr[0], &iph.daddr, pptr[1], &p);
+ cp = ip_vs_conn_new(&p, &dest->addr,
+ dest->port ? dest->port : pptr[1],
+ flags, dest);
+ if (!cp)
+ return NULL;
+ }
IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -472,23 +473,26 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
int ret, cs;
struct ip_vs_conn *cp;
- __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
- iph.protocol == IPPROTO_UDP)?
- IP_VS_CONN_F_ONE_PACKET : 0;
+ unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+ iph.protocol == IPPROTO_UDP)?
+ IP_VS_CONN_F_ONE_PACKET : 0;
union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
ip_vs_service_put(svc);
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
- cp = ip_vs_conn_new(svc->af, iph.protocol,
- &iph.saddr, pptr[0],
- &iph.daddr, pptr[1],
- &daddr, 0,
- IP_VS_CONN_F_BYPASS | flags,
- NULL);
- if (cp == NULL)
- return NF_DROP;
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(svc->af, iph.protocol,
+ &iph.saddr, pptr[0],
+ &iph.daddr, pptr[1], &p);
+ cp = ip_vs_conn_new(&p, &daddr, 0,
+ IP_VS_CONN_F_BYPASS | flags,
+ NULL);
+ if (!cp)
+ return NF_DROP;
+ }
/* statistics */
ip_vs_in_stats(cp, skb);
@@ -526,9 +530,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
*/
#ifdef CONFIG_IP_VS_IPV6
- if (svc->af == AF_INET6)
+ if (svc->af == AF_INET6) {
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
- else
+ } else
#endif
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -540,6 +549,15 @@ __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+ if (NF_INET_LOCAL_IN == hooknum)
+ return IP_DEFRAG_VS_IN;
+ if (NF_INET_FORWARD == hooknum)
+ return IP_DEFRAG_VS_FWD;
+ return IP_DEFRAG_VS_OUT;
+}
+
static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
int err = ip_defrag(skb, user);
@@ -600,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (inout)
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered outgoing ICMP");
else
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered incoming ICMP");
}
@@ -637,17 +655,21 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
}
/* And finally the ICMP checksum */
- icmph->icmp6_cksum = 0;
- /* TODO IPv6: is this correct for ICMPv6? */
- ip_vs_checksum_complete(skb, icmp_offset);
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+ skb->len - icmp_offset,
+ IPPROTO_ICMPV6, 0);
+ skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+ skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ skb->ip_summed = CHECKSUM_PARTIAL;
if (inout)
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
- "Forwarding altered outgoing ICMPv6");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMPv6");
else
- IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
- "Forwarding altered incoming ICMPv6");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMPv6");
}
#endif
@@ -688,10 +710,25 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
#endif
ip_vs_nat_icmp(skb, pp, cp, 1);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+ goto out;
+ } else
+#endif
+ if ((sysctl_ip_vs_snat_reroute ||
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ goto out;
+
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
verdict = NF_ACCEPT;
out:
@@ -705,7 +742,8 @@ out:
* Find any that might be relevant, check against existing connections.
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+ unsigned int hooknum)
{
struct iphdr *iph;
struct icmphdr _icmph, *ic;
@@ -720,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
/* reassemble IP fragments */
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -763,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking outgoing ICMP for");
offset += cih->ihl * 4;
@@ -779,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
}
#ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum)
{
struct ipv6hdr *iph;
struct icmp6hdr _icmph, *ic;
@@ -795,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
/* reassemble IP fragments */
if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+ if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -838,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+ "Checking outgoing ICMPv6 for");
offset += sizeof(struct ipv6hdr);
@@ -886,7 +927,7 @@ static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int ihl)
{
- IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
if (!skb_make_writable(skb, ihl))
goto drop;
@@ -905,6 +946,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
ip_send_check(ip_hdr(skb));
}
+ /*
+ * nf_iterate does not expect change in the skb->dst->dev.
+ * It looks like it is not fatal to enable this code for hooks
+ * where our handlers are at the end of the chain list and
+ * when all next handlers use skb->dst->dev and not outdev.
+ * It will definitely route properly the inout NAT traffic
+ * when multiple paths are used.
+ */
+
/* For policy routing, packets originating from this
* machine itself may be routed differently to packets
* passing through. We want this packet to be routed as
@@ -913,20 +963,25 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
*/
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (ip6_route_me_harder(skb) != 0)
+ if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
goto drop;
} else
#endif
- if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ if ((sysctl_ip_vs_snat_reroute ||
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
goto drop;
- IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+ IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
- ip_vs_conn_put(cp);
-
skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ ip_vs_conn_put(cp);
LeaveFunction(11);
return NF_ACCEPT;
@@ -934,35 +989,46 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
+ LeaveFunction(11);
return NF_STOLEN;
}
/*
- * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
- int af;
EnterFunction(11);
- af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
+ /* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+ if (unlikely(!skb_dst(skb)))
+ return NF_ACCEPT;
+
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
- int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+ int related;
+ int verdict = ip_vs_out_icmp_v6(skb, &related,
+ hooknum);
if (related)
return verdict;
@@ -971,7 +1037,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_out_icmp(skb, &related);
+ int related;
+ int verdict = ip_vs_out_icmp(skb, &related, hooknum);
if (related)
return verdict;
@@ -985,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
- int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
- if (related)
- return verdict;
-
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+ if (ip_vs_gather_frags_v6(skb,
+ ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
}
+
+ ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
} else
#endif
if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
!pp->dont_defrag)) {
- if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+ if (ip_vs_gather_frags(skb,
+ ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1008,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
*/
cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
- if (unlikely(!cp)) {
- if (sysctl_ip_vs_nat_icmp_send &&
- (pp->protocol == IPPROTO_TCP ||
- pp->protocol == IPPROTO_UDP ||
- pp->protocol == IPPROTO_SCTP)) {
- __be16 _ports[2], *pptr;
-
- pptr = skb_header_pointer(skb, iph.len,
- sizeof(_ports), _ports);
- if (pptr == NULL)
- return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(af, iph.protocol,
- &iph.saddr,
- pptr[0])) {
- /*
- * Notify the real server: there is no
- * existing entry if it is not RST
- * packet or not TCP packet.
- */
- if ((iph.protocol != IPPROTO_TCP &&
- iph.protocol != IPPROTO_SCTP)
- || ((iph.protocol == IPPROTO_TCP
- && !is_tcp_reset(skb, iph.len))
- || (iph.protocol == IPPROTO_SCTP
- && !is_sctp_abort(skb,
- iph.len)))) {
+ if (likely(cp))
+ return handle_response(af, skb, pp, cp, iph.len);
+ if (sysctl_ip_vs_nat_icmp_send &&
+ (pp->protocol == IPPROTO_TCP ||
+ pp->protocol == IPPROTO_UDP ||
+ pp->protocol == IPPROTO_SCTP)) {
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, iph.len,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ return NF_ACCEPT; /* Not for me */
+ if (ip_vs_lookup_real_service(af, iph.protocol,
+ &iph.saddr,
+ pptr[0])) {
+ /*
+ * Notify the real server: there is no
+ * existing entry if it is not RST
+ * packet or not TCP packet.
+ */
+ if ((iph.protocol != IPPROTO_TCP &&
+ iph.protocol != IPPROTO_SCTP)
+ || ((iph.protocol == IPPROTO_TCP
+ && !is_tcp_reset(skb, iph.len))
+ || (iph.protocol == IPPROTO_SCTP
+ && !is_sctp_abort(skb,
+ iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
- if (af == AF_INET6)
- icmpv6_send(skb,
- ICMPV6_DEST_UNREACH,
- ICMPV6_PORT_UNREACH,
- 0);
- else
+ if (af == AF_INET6) {
+ struct net *net =
+ dev_net(skb_dst(skb)->dev);
+
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ icmpv6_send(skb,
+ ICMPV6_DEST_UNREACH,
+ ICMPV6_PORT_UNREACH,
+ 0);
+ } else
#endif
- icmp_send(skb,
- ICMP_DEST_UNREACH,
- ICMP_PORT_UNREACH, 0);
- return NF_DROP;
- }
+ icmp_send(skb,
+ ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+ return NF_DROP;
}
}
- IP_VS_DBG_PKT(12, pp, skb, 0,
- "packet continues traversal as normal");
- return NF_ACCEPT;
}
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_out: packet continues traversal as normal");
+ return NF_ACCEPT;
+}
+
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_out(hooknum, skb, AF_INET);
+}
+
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_out(hooknum, skb, AF_INET);
+ local_bh_enable();
+ return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_out(hooknum, skb, AF_INET6);
+}
- return handle_response(af, skb, pp, cp, iph.len);
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_out(hooknum, skb, AF_INET6);
+ local_bh_enable();
+ return verdict;
}
+#endif
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1080,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
/* reassemble IP fragments */
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
- IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1124,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking incoming ICMP for");
offset += cih->ihl * 4;
@@ -1158,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
offset += 2 * sizeof(__u16);
verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
- /* do not touch skb anymore */
+ /* LOCALNODE from FORWARD hook is not supported */
+ if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+ skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
+ IP_VS_DBG(1, "%s(): "
+ "local delivery to %pI4 but in FORWARD\n",
+ __func__, &skb_rtable(skb)->rt_dst);
+ verdict = NF_DROP;
+ }
out:
__ip_vs_conn_put(cp);
@@ -1179,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
struct ip_vs_protocol *pp;
unsigned int offset, verdict;
union nf_inet_addr snet;
+ struct rt6_info *rt;
*related = 1;
/* reassemble IP fragments */
if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
- if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
- IP_DEFRAG_VS_IN :
- IP_DEFRAG_VS_FWD))
+ if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
return NF_STOLEN;
}
@@ -1229,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
return NF_ACCEPT;
- IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+ "Checking incoming ICMPv6 for");
offset += sizeof(struct ipv6hdr);
@@ -1257,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
IPPROTO_SCTP == cih->nexthdr)
offset += 2 * sizeof(__u16);
verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
- /* do not touch skb anymore */
+ /* LOCALNODE from FORWARD hook is not supported */
+ if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+ (rt = (struct rt6_info *) skb_dst(skb)) &&
+ rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
+ IP_VS_DBG(1, "%s(): "
+ "local delivery to %pI6 but in FORWARD\n",
+ __func__, &rt->rt6i_dst);
+ verdict = NF_DROP;
+ }
__ip_vs_conn_put(cp);
@@ -1271,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
* and send it on its way...
*/
static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
- int ret, restart, af, pkts;
+ int ret, restart, pkts;
- af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
- ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
/*
- * Big tappo: only PACKET_HOST, including loopback for local client
- * Don't handle local packets on IPv6 for now
+ * Big tappo:
+ * - remote client: only PACKET_HOST
+ * - route: used for struct net when skb->dev is unset
*/
- if (unlikely(skb->pkt_type != PACKET_HOST)) {
- IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
- skb->pkt_type,
- iph.protocol,
- IP_VS_DBG_ADDR(af, &iph.daddr));
+ if (unlikely((skb->pkt_type != PACKET_HOST &&
+ hooknum != NF_INET_LOCAL_OUT) ||
+ !skb_dst(skb))) {
+ ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+ IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+ " ignored in hook %u\n",
+ skb->pkt_type, iph.protocol,
+ IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
return NF_ACCEPT;
}
+ ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
- int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+ int related;
+ int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
if (related)
return verdict;
@@ -1308,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
} else
#endif
if (unlikely(iph.protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+ int related;
+ int verdict = ip_vs_in_icmp(skb, &related, hooknum);
if (related)
return verdict;
@@ -1328,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
if (unlikely(!cp)) {
int v;
- /* For local client packets, it could be a response */
- cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
- if (cp)
- return handle_response(af, skb, pp, cp, iph.len);
-
if (!pp->conn_schedule(af, skb, pp, &v, &cp))
return v;
}
if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */
- IP_VS_DBG_PKT(12, pp, skb, 0,
- "packet continues traversal as normal");
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_in: packet continues traversal as normal");
return NF_ACCEPT;
}
- IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1380,8 +1540,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
cp->protocol == IPPROTO_SCTP) {
if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
- (atomic_read(&cp->in_pkts) %
- sysctl_ip_vs_sync_threshold[1]
+ (pkts % sysctl_ip_vs_sync_threshold[1]
== sysctl_ip_vs_sync_threshold[0])) ||
(cp->old_state != cp->state &&
((cp->state == IP_VS_SCTP_S_CLOSED) ||
@@ -1392,7 +1551,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
}
}
- if (af == AF_INET &&
+ /* Keep this block last: TCP and others with pp->num_states <= 1 */
+ else if (af == AF_INET &&
(ip_vs_sync_state & IP_VS_STATE_MASTER) &&
(((cp->protocol != IPPROTO_TCP ||
cp->state == IP_VS_TCP_S_ESTABLISHED) &&
@@ -1411,6 +1571,72 @@ out:
return ret;
}
+/*
+ * AF_INET handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ * AF_INET handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_in(hooknum, skb, AF_INET);
+ local_bh_enable();
+ return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ unsigned int verdict;
+
+ /* Disable BH in LOCAL_OUT until all places are fixed */
+ local_bh_disable();
+ verdict = ip_vs_in(hooknum, skb, AF_INET6);
+ local_bh_enable();
+ return verdict;
+}
+
+#endif
+
/*
* It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1451,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply4,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 99,
+ },
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_in,
+ .hook = ip_vs_remote_request4,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 101,
},
- /* After packet filtering, change source only for VS/NAT */
+ /* Before ip_vs_in, change source only for VS/NAT */
{
- .hook = ip_vs_out,
+ .hook = ip_vs_local_reply4,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_INET_FORWARD,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -99,
+ },
+ /* After mangle, schedule and forward local requests */
+ {
+ .hook = ip_vs_local_request4,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -98,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1475,27 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
.hook = ip_vs_forward_icmp,
.owner = THIS_MODULE,
.pf = PF_INET,
- .hooknum = NF_INET_FORWARD,
- .priority = 99,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply4,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
},
#ifdef CONFIG_IP_VS_IPV6
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply6,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 99,
+ },
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_in,
+ .hook = ip_vs_remote_request6,
.owner = THIS_MODULE,
.pf = PF_INET6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 101,
},
- /* After packet filtering, change source only for VS/NAT */
+ /* Before ip_vs_in, change source only for VS/NAT */
+ {
+ .hook = ip_vs_local_reply6,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -99,
+ },
+ /* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_out,
+ .hook = ip_vs_local_request6,
.owner = THIS_MODULE,
.pf = PF_INET6,
- .hooknum = NF_INET_FORWARD,
- .priority = 100,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = -98,
},
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1503,8 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
.hook = ip_vs_forward_icmp_v6,
.owner = THIS_MODULE,
.pf = PF_INET6,
- .hooknum = NF_INET_FORWARD,
- .priority = 99,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply6,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
},
#endif
};
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0f0c079c422..5f5daa30b0a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock);
static DEFINE_RWLOCK(__ip_vs_rs_lock);
/* lock for state and timeout tables */
-static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
/* lock for drop entry handling */
static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
@@ -88,6 +88,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_expire_quiescent_template = 0;
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
+int sysctl_ip_vs_snat_reroute = 1;
#ifdef CONFIG_IP_VS_DEBUG
@@ -204,7 +208,7 @@ static void update_defense_level(void)
spin_unlock(&__ip_vs_droppacket_lock);
/* secure_tcp */
- write_lock(&__ip_vs_securetcp_lock);
+ spin_lock(&ip_vs_securetcp_lock);
switch (sysctl_ip_vs_secure_tcp) {
case 0:
if (old_secure_tcp >= 2)
@@ -238,7 +242,7 @@ static void update_defense_level(void)
old_secure_tcp = sysctl_ip_vs_secure_tcp;
if (to_change >= 0)
ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
- write_unlock(&__ip_vs_securetcp_lock);
+ spin_unlock(&ip_vs_securetcp_lock);
local_bh_enable();
}
@@ -401,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
* Get service by {proto,addr,port} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
__be16 vport)
{
unsigned hash;
@@ -416,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
&& (svc->port == vport)
&& (svc->protocol == protocol)) {
/* HIT */
- atomic_inc(&svc->usecnt);
return svc;
}
}
@@ -429,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
* Get service by {fwmark} in the service table.
*/
static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
{
unsigned hash;
struct ip_vs_service *svc;
@@ -440,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af) {
/* HIT */
- atomic_inc(&svc->usecnt);
return svc;
}
}
@@ -459,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
/*
* Check the table hashed by fwmark first
*/
- if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+ if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
goto out;
/*
* Check the table hashed by <protocol,addr,port>
* for "full" addressed entries
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+ svc = __ip_vs_service_find(af, protocol, vaddr, vport);
if (svc == NULL
&& protocol == IPPROTO_TCP
@@ -476,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+ svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
}
if (svc == NULL
@@ -484,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
/*
* Check if the catch-all port (port zero) exists
*/
- svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+ svc = __ip_vs_service_find(af, protocol, vaddr, 0);
}
out:
+ if (svc)
+ atomic_inc(&svc->usecnt);
read_unlock(&__ip_vs_svc_lock);
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -506,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
dest->svc = svc;
}
-static inline void
+static void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = dest->svc;
dest->svc = NULL;
- if (atomic_dec_and_test(&svc->refcnt))
+ if (atomic_dec_and_test(&svc->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port), atomic_read(&svc->usecnt));
kfree(svc);
+ }
}
@@ -758,31 +767,18 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
* Update a destination in the given service
*/
static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
- struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+ struct ip_vs_dest_user_kern *udest, int add)
{
int conn_flags;
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
- conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
- /* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
- if (svc->af == AF_INET6) {
- if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
- conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
- | IP_VS_CONN_F_LOCALNODE;
- }
- } else
-#endif
- if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
- conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
- | IP_VS_CONN_F_LOCALNODE;
- }
+ conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+ conn_flags |= IP_VS_CONN_F_INACTIVE;
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
- if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
/*
@@ -813,6 +809,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
dest->u_threshold = udest->u_threshold;
dest->l_threshold = udest->l_threshold;
+
+ spin_lock(&dest->dst_lock);
+ ip_vs_dst_reset(dest);
+ spin_unlock(&dest->dst_lock);
+
+ if (add)
+ ip_vs_new_estimator(&dest->stats);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /* Wait until all other svc users go away */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+ if (add) {
+ list_add(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+ }
+
+ /* call the update_service, because server weight may be changed */
+ if (svc->scheduler->update_service)
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
}
@@ -843,7 +862,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
return -EINVAL;
}
- dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+ dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
if (dest == NULL) {
pr_err("%s(): no memory.\n", __func__);
return -ENOMEM;
@@ -860,13 +879,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->activeconns, 0);
atomic_set(&dest->inactconns, 0);
atomic_set(&dest->persistconns, 0);
- atomic_set(&dest->refcnt, 0);
+ atomic_set(&dest->refcnt, 1);
INIT_LIST_HEAD(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
- __ip_vs_update_dest(svc, dest, udest);
- ip_vs_new_estimator(&dest->stats);
+ __ip_vs_update_dest(svc, dest, udest, 1);
*dest_p = dest;
@@ -926,65 +944,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
- __ip_vs_update_dest(svc, dest, udest);
-
/*
* Get the destination from the trash
*/
list_del(&dest->n_list);
- ip_vs_new_estimator(&dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
+ __ip_vs_update_dest(svc, dest, udest, 1);
+ ret = 0;
+ } else {
/*
- * Wait until all other svc users go away.
+ * Allocate and initialize the dest structure
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
- return 0;
- }
-
- /*
- * Allocate and initialize the dest structure
- */
- ret = ip_vs_new_dest(svc, udest, &dest);
- if (ret) {
- return ret;
+ ret = ip_vs_new_dest(svc, udest, &dest);
}
-
- /*
- * Add the dest entry into the list
- */
- atomic_inc(&dest->refcnt);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- list_add(&dest->n_list, &svc->destinations);
- svc->num_dests++;
-
- /* call the update_service function of its scheduler */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
LeaveFunction(2);
- return 0;
+ return ret;
}
@@ -1023,19 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ENOENT;
}
- __ip_vs_update_dest(svc, dest, udest);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
- /* call the update_service, because server weight may be changed */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
-
+ __ip_vs_update_dest(svc, dest, udest, 0);
LeaveFunction(2);
return 0;
@@ -1062,6 +1025,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
* the destination into the trash.
*/
if (atomic_dec_and_test(&dest->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
ip_vs_dst_reset(dest);
/* simply decrease svc->refcnt here, let the caller check
and release the service if nobody refers to it.
@@ -1128,7 +1095,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
/*
* Wait until all other svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
/*
* Unlink dest from the service
@@ -1157,6 +1124,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
{
int ret = 0;
struct ip_vs_scheduler *sched = NULL;
+ struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
/* increase the module use count */
@@ -1167,7 +1135,17 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
if (sched == NULL) {
pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
ret = -ENOENT;
- goto out_mod_dec;
+ goto out_err;
+ }
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_get(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1177,7 +1155,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
#endif
- svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+ svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
@@ -1185,7 +1163,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
/* I'm the first user of the service */
- atomic_set(&svc->usecnt, 1);
+ atomic_set(&svc->usecnt, 0);
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
@@ -1207,6 +1185,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
goto out_err;
sched = NULL;
+ /* Bind the ct retriever */
+ ip_vs_bind_pe(svc, pe);
+ pe = NULL;
+
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1227,10 +1209,9 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
*svc_p = svc;
return 0;
- out_err:
+ out_err:
if (svc != NULL) {
- if (svc->scheduler)
- ip_vs_unbind_scheduler(svc);
+ ip_vs_unbind_scheduler(svc);
if (svc->inc) {
local_bh_disable();
ip_vs_app_inc_put(svc->inc);
@@ -1239,8 +1220,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
kfree(svc);
}
ip_vs_scheduler_put(sched);
+ ip_vs_pe_put(pe);
- out_mod_dec:
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1255,6 +1236,7 @@ static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
struct ip_vs_scheduler *sched, *old_sched;
+ struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
/*
@@ -1267,6 +1249,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
old_sched = sched;
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_get(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out;
+ }
+ old_pe = pe;
+ }
+
#ifdef CONFIG_IP_VS_IPV6
if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
ret = -EINVAL;
@@ -1279,7 +1272,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
/*
* Wait until all other svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
/*
* Set the flags and timeout value
@@ -1318,15 +1311,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
}
+ old_pe = svc->pe;
+ if (pe != old_pe) {
+ ip_vs_unbind_pe(svc);
+ ip_vs_bind_pe(svc, pe);
+ }
+
out_unlock:
write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
out:
-#endif
-
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
-
+ ip_vs_scheduler_put(old_sched);
+ ip_vs_pe_put(old_pe);
return ret;
}
@@ -1340,6 +1335,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
+ struct ip_vs_pe *old_pe;
+
+ pr_info("%s: enter\n", __func__);
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
@@ -1350,8 +1348,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/* Unbind scheduler */
old_sched = svc->scheduler;
ip_vs_unbind_scheduler(svc);
- if (old_sched)
- ip_vs_scheduler_put(old_sched);
+ ip_vs_scheduler_put(old_sched);
+
+ /* Unbind persistence engine */
+ old_pe = svc->pe;
+ ip_vs_unbind_pe(svc);
+ ip_vs_pe_put(old_pe);
/* Unbind app inc */
if (svc->inc) {
@@ -1378,21 +1380,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Free the service if nobody refers to it
*/
- if (atomic_read(&svc->refcnt) == 0)
+ if (atomic_read(&svc->refcnt) == 0) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port), atomic_read(&svc->usecnt));
kfree(svc);
+ }
/* decrease the module use count */
ip_vs_use_count_dec();
}
/*
- * Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
*/
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
{
- if (svc == NULL)
- return -EEXIST;
-
/*
* Unhash it from the service table
*/
@@ -1403,11 +1407,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Wait until all the svc users go away.
*/
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
__ip_vs_del_service(svc);
write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ * Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+ if (svc == NULL)
+ return -EEXIST;
+ ip_vs_unlink_service(svc);
return 0;
}
@@ -1426,14 +1440,7 @@ static int ip_vs_flush(void)
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
+ ip_vs_unlink_service(svc);
}
}
@@ -1443,14 +1450,7 @@ static int ip_vs_flush(void)
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
list_for_each_entry_safe(svc, nxt,
&ip_vs_svc_fwm_table[idx], f_list) {
- write_lock_bh(&__ip_vs_svc_lock);
- ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
- __ip_vs_del_service(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
+ ip_vs_unlink_service(svc);
}
}
@@ -1579,6 +1579,15 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
+#ifdef CONFIG_IP_VS_NFCT
+ {
+ .procname = "conntrack",
+ .data = &sysctl_ip_vs_conntrack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{
.procname = "secure_tcp",
.data = &sysctl_ip_vs_secure_tcp,
@@ -1586,6 +1595,13 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
+ {
+ .procname = "snat_reroute",
+ .data = &sysctl_ip_vs_snat_reroute,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#if 0
{
.procname = "timeout_established",
@@ -2041,6 +2057,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
struct ip_vs_service_user *usvc_compat)
{
+ memset(usvc, 0, sizeof(*usvc));
+
usvc->af = AF_INET;
usvc->protocol = usvc_compat->protocol;
usvc->addr.ip = usvc_compat->addr;
@@ -2058,6 +2076,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
struct ip_vs_dest_user *udest_compat)
{
+ memset(udest, 0, sizeof(*udest));
+
udest->addr.ip = udest_compat->addr;
udest->port = udest_compat->port;
udest->conn_flags = udest_compat->conn_flags;
@@ -2147,10 +2167,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Lookup the exact service by <protocol, addr, port> or fwmark */
if (usvc.fwmark == 0)
- svc = __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
+ svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+ &usvc.addr, usvc.port);
else
- svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+ svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2189,9 +2209,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
ret = -EINVAL;
}
- if (svc)
- ip_vs_service_put(svc);
-
out_unlock:
mutex_unlock(&__ip_vs_mutex);
out_dec:
@@ -2284,10 +2301,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
int ret = 0;
if (get->fwmark)
- svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+ svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
else
- svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
- get->port);
+ svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+ get->port);
if (svc) {
int count = 0;
@@ -2315,7 +2332,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
}
count++;
}
- ip_vs_service_put(svc);
} else
ret = -ESRCH;
return ret;
@@ -2436,15 +2452,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
if (entry->fwmark)
- svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+ svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
else
- svc = __ip_vs_service_get(AF_INET, entry->protocol,
- &addr, entry->port);
+ svc = __ip_vs_service_find(AF_INET, entry->protocol,
+ &addr, entry->port);
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
ret = -EFAULT;
- ip_vs_service_put(svc);
} else
ret = -ESRCH;
}
@@ -2559,6 +2574,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
[IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
[IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
.len = IP_VS_SCHEDNAME_MAXLEN },
+ [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_PENAME_MAXLEN },
[IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
.len = sizeof(struct ip_vs_flags) },
[IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
@@ -2635,6 +2652,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
}
NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+ if (svc->pe)
+ NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2711,10 +2730,12 @@ nla_put_failure:
}
static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
- struct nlattr *nla, int full_entry)
+ struct nlattr *nla, int full_entry,
+ struct ip_vs_service **ret_svc)
{
struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+ struct ip_vs_service *svc;
/* Parse mandatory identifying service fields first */
if (nla == NULL ||
@@ -2750,14 +2771,21 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
usvc->fwmark = 0;
}
+ if (usvc->fwmark)
+ svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+ else
+ svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+ &usvc->addr, usvc->port);
+ *ret_svc = svc;
+
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
- struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+ struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
*nla_netmask;
struct ip_vs_flags flags;
- struct ip_vs_service *svc;
nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+ nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2768,21 +2796,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
nla_memcpy(&flags, nla_flags, sizeof(flags));
/* prefill flags from service if it already exists */
- if (usvc->fwmark)
- svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
- else
- svc = __ip_vs_service_get(usvc->af, usvc->protocol,
- &usvc->addr, usvc->port);
- if (svc) {
+ if (svc)
usvc->flags = svc->flags;
- ip_vs_service_put(svc);
- } else
- usvc->flags = 0;
/* set new flags from userland */
usvc->flags = (usvc->flags & ~flags.mask) |
(flags.flags & flags.mask);
usvc->sched_name = nla_data(nla_sched);
+ usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
usvc->timeout = nla_get_u32(nla_timeout);
usvc->netmask = nla_get_u32(nla_netmask);
}
@@ -2793,17 +2814,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
{
struct ip_vs_service_user_kern usvc;
+ struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(&usvc, nla, 0);
- if (ret)
- return ERR_PTR(ret);
-
- if (usvc.fwmark)
- return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
- else
- return __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
+ ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+ return ret ? ERR_PTR(ret) : svc;
}
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2894,7 +2909,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
nla_put_failure:
cb->args[0] = idx;
- ip_vs_service_put(svc);
out_err:
mutex_unlock(&__ip_vs_mutex);
@@ -3107,17 +3121,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
ret = ip_vs_genl_parse_service(&usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
- need_full_svc);
+ need_full_svc, &svc);
if (ret)
goto out;
- /* Lookup the exact service by <protocol, addr, port> or fwmark */
- if (usvc.fwmark == 0)
- svc = __ip_vs_service_get(usvc.af, usvc.protocol,
- &usvc.addr, usvc.port);
- else
- svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
/* Unless we're adding a new service, the service must already exist */
if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
ret = -ESRCH;
@@ -3151,6 +3158,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
break;
case IPVS_CMD_DEL_SERVICE:
ret = ip_vs_del_service(svc);
+ /* do not use svc, it can be freed */
break;
case IPVS_CMD_NEW_DEST:
ret = ip_vs_add_dest(svc, &udest);
@@ -3169,8 +3177,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
}
out:
- if (svc)
- ip_vs_service_put(svc);
mutex_unlock(&__ip_vs_mutex);
return ret;
@@ -3216,7 +3222,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
goto out_err;
} else if (svc) {
ret = ip_vs_genl_fill_service(msg, svc);
- ip_vs_service_put(svc);
if (ret)
goto nla_put_failure;
} else {
@@ -3385,6 +3390,16 @@ int __init ip_vs_control_init(void)
EnterFunction(2);
+ /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+ for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+ }
+ smp_wmb();
+
ret = nf_register_sockopt(&ip_vs_sockopts);
if (ret) {
pr_err("cannot register sockopt.\n");
@@ -3403,15 +3418,6 @@ int __init ip_vs_control_init(void)
sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
- /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
- for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
- INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
- }
- for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_rtable[idx]);
- }
-
ip_vs_new_estimator(&ip_vs_stats);
/* Hook the defense timer */
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index f228a17ec64..75455000ad1 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -20,17 +20,6 @@
*
* Author: Wouter Gadeyne
*
- *
- * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
- * http://www.ssi.bg/~ja/nfct/:
- *
- * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
- *
- * Portions Copyright (C) 2001-2002
- * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
- *
- * Portions Copyright (C) 2003-2008
- * Julian Anastasov
*/
#define KMSG_COMPONENT "IPVS"
@@ -45,6 +34,7 @@
#include <linux/netfilter.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <linux/gfp.h>
#include <net/protocol.h>
@@ -57,16 +47,6 @@
#define SERVER_STRING "227 Entering Passive Mode ("
#define CLIENT_STRING "PORT "
-#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
- &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
- (T)->dst.protonum
-
-#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
- &((C)->vaddr.ip), ntohs((C)->vport), \
- &((C)->daddr.ip), ntohs((C)->dport), \
- (C)->protocol, (C)->state
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -84,6 +64,8 @@ static int ip_vs_ftp_pasv;
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
+ /* We use connection tracking for the command connection */
+ cp->flags |= IP_VS_CONN_F_NFCT;
return 0;
}
@@ -148,120 +130,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
}
/*
- * Called from init_conntrack() as expectfn handler.
- */
-static void
-ip_vs_expect_callback(struct nf_conn *ct,
- struct nf_conntrack_expect *exp)
-{
- struct nf_conntrack_tuple *orig, new_reply;
- struct ip_vs_conn *cp;
-
- if (exp->tuple.src.l3num != PF_INET)
- return;
-
- /*
- * We assume that no NF locks are held before this callback.
- * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
- * expectations even if they use wildcard values, now we provide the
- * actual values from the newly created original conntrack direction.
- * The conntrack is confirmed when packet reaches IPVS hooks.
- */
-
- /* RS->CLIENT */
- orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
- &orig->src.u3, orig->src.u.tcp.port,
- &orig->dst.u3, orig->dst.u.tcp.port);
- if (cp) {
- /* Change reply CLIENT->RS to CLIENT->VS */
- new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- new_reply.dst.u3 = cp->vaddr;
- new_reply.dst.u.tcp.port = cp->vport;
- IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
- ", inout cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- goto alter;
- }
-
- /* CLIENT->VS */
- cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
- &orig->src.u3, orig->src.u.tcp.port,
- &orig->dst.u3, orig->dst.u.tcp.port);
- if (cp) {
- /* Change reply VS->CLIENT to RS->CLIENT */
- new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- new_reply.src.u3 = cp->daddr;
- new_reply.src.u.tcp.port = cp->dport;
- IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", outin cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- goto alter;
- }
-
- IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
- " - unknown expect\n",
- __func__, ct, ct->status, ARG_TUPLE(orig));
- return;
-
-alter:
- /* Never alter conntrack for non-NAT conns */
- if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
- nf_conntrack_alter_reply(ct, &new_reply);
- ip_vs_conn_put(cp);
- return;
-}
-
-/*
- * Create NF conntrack expectation with wildcard (optional) source port.
- * Then the default callback function will alter the reply and will confirm
- * the conntrack entry when the first packet comes.
- */
-static void
-ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
- struct ip_vs_conn *cp, u_int8_t proto,
- const __be16 *port, int from_rs)
-{
- struct nf_conntrack_expect *exp;
-
- BUG_ON(!ct || ct == &nf_conntrack_untracked);
-
- exp = nf_ct_expect_alloc(ct);
- if (!exp)
- return;
-
- if (from_rs)
- nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
- nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
- proto, port, &cp->cport);
- else
- nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
- nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
- proto, port, &cp->vport);
-
- exp->expectfn = ip_vs_expect_callback;
-
- IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&exp->tuple));
- nf_ct_expect_related(exp);
- nf_ct_expect_put(exp);
-}
-
-/*
* Look at outgoing ftp packets to catch the response to a PASV command
* from the server (inside-to-outside).
* When we see one, we build a connection entry with the client address,
@@ -327,14 +195,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
/*
* Now update or create an connection entry for it
*/
- n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
- &cp->caddr, 0);
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(AF_INET, iph->protocol,
+ &from, port, &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
if (!n_cp) {
- n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
- &cp->caddr, 0,
- &cp->vaddr, port,
- &from, port,
- IP_VS_CONN_F_NO_CPORT,
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ n_cp = ip_vs_conn_new(&p, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
cp->dest);
if (!n_cp)
return 0;
@@ -359,7 +232,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
buf_len = strlen(buf);
ct = nf_ct_get(skb, &ctinfo);
- if (ct && !nf_ct_is_untracked(ct)) {
+ if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
/* If mangling fails this function will return 0
* which will cause the packet to be dropped.
* Mangling can only fail under memory pressure,
@@ -369,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
start-data, end-start,
buf, buf_len);
- if (ret)
- ip_vs_expect_related(skb, ct, n_cp,
- IPPROTO_TCP, NULL, 0);
+ if (ret) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ IPPROTO_TCP, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
}
/*
@@ -409,7 +287,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
- struct nf_conn *ct;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
@@ -479,28 +356,24 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
ip_vs_proto_name(iph->protocol),
&to.ip, ntohs(port), &cp->vaddr.ip, 0);
- n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
- &to, port,
- &cp->vaddr, htons(ntohs(cp->vport)-1));
- if (!n_cp) {
- n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
- &to, port,
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
&cp->vaddr, htons(ntohs(cp->vport)-1),
- &cp->daddr, htons(ntohs(cp->dport)-1),
- 0,
- cp->dest);
- if (!n_cp)
- return 0;
+ &p);
+ n_cp = ip_vs_conn_in_get(&p);
+ if (!n_cp) {
+ n_cp = ip_vs_conn_new(&p, &cp->daddr,
+ htons(ntohs(cp->dport)-1),
+ IP_VS_CONN_F_NFCT, cp->dest);
+ if (!n_cp)
+ return 0;
- /* add its controller */
- ip_vs_control_add(n_cp, cp);
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
}
- ct = (struct nf_conn *)skb->nfct;
- if (ct && ct != &nf_conntrack_untracked)
- ip_vs_expect_related(skb, ct, n_cp,
- IPPROTO_TCP, &n_cp->dport, 1);
-
/*
* Move tunnel to listen state
*/
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 00000000000..4680647cd45
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+ &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+ (T)->dst.protonum
+
+#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
+ &((C)->vaddr.ip), ntohs((C)->vport), \
+ &((C)->daddr.ip), ntohs((C)->dport), \
+ (C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple new_tuple;
+
+ if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+ nf_ct_is_dying(ct))
+ return;
+
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return;
+
+ /* Alter reply only in original direction */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return;
+
+ /*
+ * The connection is not yet in the hashtable, so we update it.
+ * CIP->VIP will remain the same, so leave the tuple in
+ * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
+ * real-server we will see RIP->DIP.
+ */
+ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ /*
+ * This will also take care of UDP and other protocols.
+ */
+ if (outin) {
+ new_tuple.src.u3 = cp->daddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.src.u.tcp.port = cp->dport;
+ } else {
+ new_tuple.dst.u3 = cp->vaddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.dst.u.tcp.port = cp->vport;
+ }
+ IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE
+ ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+ ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+ return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_conntrack_tuple *orig, new_reply;
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+
+ if (exp->tuple.src.l3num != PF_INET)
+ return;
+
+ /*
+ * We assume that no NF locks are held before this callback.
+ * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+ * expectations even if they use wildcard values, now we provide the
+ * actual values from the newly created original conntrack direction.
+ * The conntrack is confirmed when packet reaches IPVS hooks.
+ */
+
+ /* RS->CLIENT */
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+ &orig->src.u3, orig->src.u.tcp.port,
+ &orig->dst.u3, orig->dst.u.tcp.port, &p);
+ cp = ip_vs_conn_out_get(&p);
+ if (cp) {
+ /* Change reply CLIENT->RS to CLIENT->VS */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.dst.u3 = cp->vaddr;
+ new_reply.dst.u.tcp.port = cp->vport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+ ", inout cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ /* CLIENT->VS */
+ cp = ip_vs_conn_in_get(&p);
+ if (cp) {
+ /* Change reply VS->CLIENT to RS->CLIENT */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.src.u3 = cp->daddr;
+ new_reply.src.u.tcp.port = cp->dport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
+ return;
+
+alter:
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+ nf_conntrack_alter_reply(ct, &new_reply);
+ ip_vs_conn_put(cp);
+ return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+ struct ip_vs_conn *cp, u_int8_t proto,
+ const __be16 port, int from_rs)
+{
+ struct nf_conntrack_expect *exp;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return;
+
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ from_rs ? &cp->daddr : &cp->caddr,
+ from_rs ? &cp->caddr : &cp->vaddr,
+ proto, port ? &port : NULL,
+ from_rs ? &cp->cport : &cp->vport);
+
+ exp->expectfn = ip_vs_nfct_expect_callback;
+
+ IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct nf_conntrack_tuple tuple;
+
+ if (!cp->cport)
+ return;
+
+ tuple = (struct nf_conntrack_tuple) {
+ .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+ tuple.src.u3 = cp->caddr;
+ tuple.src.u.all = cp->cport;
+ tuple.src.l3num = cp->af;
+ tuple.dst.u3 = cp->vaddr;
+ tuple.dst.u.all = cp->vport;
+
+ IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+ " for conn " FMT_CONN "\n",
+ __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+ h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Show what happens instead of calling nf_ct_kill() */
+ if (del_timer(&ct->timeout)) {
+ IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ if (ct->timeout.function)
+ ct->timeout.function(ct->timeout.data);
+ } else {
+ IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ }
+ nf_ct_put(ct);
+ } else {
+ IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
+ }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 00000000000..3414af70ee1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,147 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock for service table */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/* Bind a service with a pe */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+ svc->pe = pe;
+}
+
+/* Unbind a service from its pe */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+ svc->pe = NULL;
+}
+
+/* Get pe in the pe list by name */
+static struct ip_vs_pe *
+ip_vs_pe_getbyname(const char *pe_name)
+{
+ struct ip_vs_pe *pe;
+
+ IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+ pe_name);
+
+ spin_lock_bh(&ip_vs_pe_lock);
+
+ list_for_each_entry(pe, &ip_vs_pe, n_list) {
+ /* Test and get the modules atomically */
+ if (pe->module &&
+ !try_module_get(pe->module)) {
+ /* This pe is just deleted */
+ continue;
+ }
+ if (strcmp(pe_name, pe->name)==0) {
+ /* HIT */
+ spin_unlock_bh(&ip_vs_pe_lock);
+ return pe;
+ }
+ if (pe->module)
+ module_put(pe->module);
+ }
+
+ spin_unlock_bh(&ip_vs_pe_lock);
+ return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_get(const char *name)
+{
+ struct ip_vs_pe *pe;
+
+ /* Search for the pe by name */
+ pe = ip_vs_pe_getbyname(name);
+
+ /* If pe not found, load the module and search again */
+ if (!pe) {
+ request_module("ip_vs_pe_%s", name);
+ pe = ip_vs_pe_getbyname(name);
+ }
+
+ return pe;
+}
+
+void ip_vs_pe_put(struct ip_vs_pe *pe)
+{
+ if (pe && pe->module)
+ module_put(pe->module);
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ struct ip_vs_pe *tmp;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ spin_lock_bh(&ip_vs_pe_lock);
+
+ if (!list_empty(&pe->n_list)) {
+ spin_unlock_bh(&ip_vs_pe_lock);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] pe already linked\n",
+ __func__, pe->name);
+ return -EINVAL;
+ }
+
+ /* Make sure that the pe with this name doesn't exist
+ * in the pe list.
+ */
+ list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+ if (strcmp(tmp->name, pe->name) == 0) {
+ spin_unlock_bh(&ip_vs_pe_lock);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] pe already existed "
+ "in the system\n", __func__, pe->name);
+ return -EINVAL;
+ }
+ }
+ /* Add it into the d-linked pe list */
+ list_add(&pe->n_list, &ip_vs_pe);
+ spin_unlock_bh(&ip_vs_pe_lock);
+
+ pr_info("[%s] pe registered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ spin_lock_bh(&ip_vs_pe_lock);
+ if (list_empty(&pe->n_list)) {
+ spin_unlock_bh(&ip_vs_pe_lock);
+ pr_err("%s(): [%s] pe is not in the list. failed\n",
+ __func__, pe->name);
+ return -EINVAL;
+ }
+
+ /* Remove it from the d-linked pe list */
+ list_del(&pe->n_list);
+ spin_unlock_bh(&ip_vs_pe_lock);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] pe unregistered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 00000000000..b8b4e9620f3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,169 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+ const char *callid, size_t callid_len,
+ int *idx)
+{
+ size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+ memcpy(buf + *idx, callid, len);
+ buf[*idx+len] = '\0';
+ *idx += len + 1;
+ return buf + *idx - len;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len) \
+ ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \
+ callid, len, &ip_vs_dbg_idx)
+#endif
+
+static int get_callid(const char *dptr, unsigned int dataoff,
+ unsigned int datalen,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ /* Find callid */
+ while (1) {
+ int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+ SIP_HDR_CALL_ID, matchoff,
+ matchlen);
+ if (ret > 0)
+ break;
+ if (!ret)
+ return 0;
+ dataoff += *matchoff;
+ }
+
+ /* Empty callid is useless */
+ if (!*matchlen)
+ return -EINVAL;
+
+ /* Too large is useless */
+ if (*matchlen > IP_VS_PEDATA_MAXLEN)
+ return -EINVAL;
+
+ /* SIP headers are always followed by a line terminator */
+ if (*matchoff + *matchlen == datalen)
+ return -EINVAL;
+
+ /* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+ * RFC 3261 allows only CRLF, we support both. */
+ if (*(dptr + *matchoff + *matchlen) != '\r' &&
+ *(dptr + *matchoff + *matchlen) != '\n')
+ return -EINVAL;
+
+ IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+ IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+ *matchlen);
+ return 0;
+}
+
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+ struct ip_vs_iphdr iph;
+ unsigned int dataoff, datalen, matchoff, matchlen;
+ const char *dptr;
+
+ ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+ /* Only useful with UDP */
+ if (iph.protocol != IPPROTO_UDP)
+ return -EINVAL;
+
+ /* No Data ? */
+ dataoff = iph.len + sizeof(struct udphdr);
+ if (dataoff >= skb->len)
+ return -EINVAL;
+
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+
+ if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+ return -EINVAL;
+
+ p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+ if (!p->pe_data)
+ return -ENOMEM;
+
+ /* N.B: pe_data is only set on success,
+ * this allows fallback to the default persistence logic on failure
+ */
+ memcpy(p->pe_data, dptr + matchoff, matchlen);
+ p->pe_data_len = matchlen;
+
+ return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+ struct ip_vs_conn *ct)
+
+{
+ bool ret = 0;
+
+ if (ct->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * d_addr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ p->vaddr, &ct->vaddr) &&
+ ct->vport == p->vport &&
+ ct->flags & IP_VS_CONN_F_TEMPLATE &&
+ ct->protocol == p->protocol &&
+ ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+ !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+ ret = 1;
+
+ IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+ u32 initval, bool inverse)
+{
+ return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+ memcpy(buf, cp->pe_data, cp->pe_data_len);
+ return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+ .name = "sip",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+ .fill_param = ip_vs_sip_fill_param,
+ .ct_match = ip_vs_sip_ct_match,
+ .hashkey_raw = ip_vs_sip_hashkey_raw,
+ .show_pe_data = ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+ return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+ unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 027f654799f..c5399839087 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
else if (ih->frag_off & htons(IP_OFFSET))
sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
else {
- __be16 _ports[2], *pptr
-;
+ __be16 _ports[2], *pptr;
+
pptr = skb_header_pointer(skb, offset + ih->ihl*4,
sizeof(_ports), _ports);
if (pptr == NULL)
@@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
- if (skb->protocol == htons(ETH_P_IPV6))
+ if (af == AF_INET6)
ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
else
#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 1892dfc12fd..3a0461117d3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -40,6 +40,19 @@ struct isakmp_hdr {
#define PORT_ISAKMP 500
+static void
+ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
+ int inverse, struct ip_vs_conn_param *p)
+{
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ &iph->saddr, htons(PORT_ISAKMP),
+ &iph->daddr, htons(PORT_ISAKMP), p);
+ else
+ ip_vs_conn_fill_param(af, IPPROTO_UDP,
+ &iph->daddr, htons(PORT_ISAKMP),
+ &iph->saddr, htons(PORT_ISAKMP), p);
+}
static struct ip_vs_conn *
ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
int inverse)
{
struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
- if (likely(!inverse)) {
- cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
- &iph->saddr,
- htons(PORT_ISAKMP),
- &iph->daddr,
- htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
- &iph->daddr,
- htons(PORT_ISAKMP),
- &iph->saddr,
- htons(PORT_ISAKMP));
- }
-
+ ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ cp = ip_vs_conn_in_get(&p);
if (!cp) {
/*
* We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
int inverse)
{
struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
- if (likely(!inverse)) {
- cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
- &iph->saddr,
- htons(PORT_ISAKMP),
- &iph->daddr,
- htons(PORT_ISAKMP));
- } else {
- cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
- &iph->daddr,
- htons(PORT_ISAKMP),
- &iph->saddr,
- htons(PORT_ISAKMP));
- }
-
+ ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+ cp = ip_vs_conn_out_get(&p);
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
@@ -126,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
- char buf[256];
- struct iphdr _iph, *ih;
-
- ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
- if (ih == NULL)
- sprintf(buf, "TRUNCATED");
- else
- sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
-
- pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
- char buf[256];
- struct ipv6hdr _iph, *ih;
-
- ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
- if (ih == NULL)
- sprintf(buf, "TRUNCATED");
- else
- sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
-
- pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
- int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
- if (skb->protocol == htons(ETH_P_IPV6))
- ah_esp_debug_packet_v6(pp, skb, offset, msg);
- else
-#endif
- ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
static void ah_esp_init(struct ip_vs_protocol *pp)
{
/* nothing to do now */
@@ -204,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
- .debug_packet = ah_esp_debug_packet,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
.set_state_timeout = NULL,
};
@@ -228,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
- .debug_packet = ah_esp_debug_packet,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
};
#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 4c0855cb006..1ea96bcd342 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
if ((sch->type == SCTP_CID_INIT) &&
(svc = ip_vs_service_get(af, skb->mark, iph.protocol,
&iph.daddr, sh->dest))) {
+ int ignored;
+
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
@@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
+ *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+ if (!*cpp && !ignored) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
@@ -61,6 +63,7 @@ sctp_snat_handler(struct sk_buff *skb,
{
sctp_sctphdr_t *sctph;
unsigned int sctphoff;
+ struct sk_buff *iter;
__be32 crc32;
#ifdef CONFIG_IP_VS_IPV6
@@ -89,8 +92,8 @@ sctp_snat_handler(struct sk_buff *skb,
/* Calculate the checksum */
crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
- crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+ skb_walk_frags(skb, iter)
+ crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
crc32);
crc32 = sctp_end_cksum(crc32);
sctph->checksum = crc32;
@@ -102,9 +105,9 @@ static int
sctp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
-
sctp_sctphdr_t *sctph;
unsigned int sctphoff;
+ struct sk_buff *iter;
__be32 crc32;
#ifdef CONFIG_IP_VS_IPV6
@@ -133,8 +136,8 @@ sctp_dnat_handler(struct sk_buff *skb,
/* Calculate the checksum */
crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
- for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
- crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+ skb_walk_frags(skb, iter)
+ crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
crc32);
crc32 = sctp_end_cksum(crc32);
sctph->checksum = crc32;
@@ -145,9 +148,9 @@ sctp_dnat_handler(struct sk_buff *skb,
static int
sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
- struct sk_buff *list = skb_shinfo(skb)->frag_list;
unsigned int sctphoff;
struct sctphdr *sh, _sctph;
+ struct sk_buff *iter;
__le32 cmp;
__le32 val;
__u32 tmp;
@@ -166,15 +169,15 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
cmp = sh->checksum;
tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
- for (; list; list = list->next)
- tmp = sctp_update_cksum((__u8 *) list->data,
- skb_headlen(list), tmp);
+ skb_walk_frags(skb, iter)
+ tmp = sctp_update_cksum((__u8 *) iter->data,
+ skb_headlen(iter), tmp);
val = sctp_end_cksum(tmp);
if (val != cmp) {
/* CRC failure, dump it. */
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 282d24de859..f6c5200e214 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
}
+ /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
if (th->syn &&
(svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
th->dest))) {
+ int ignored;
+
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
@@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
+ *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+ if (!*cpp && !ignored) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
@@ -101,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
tcph->check =
- csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(tcph->check))));
+ csum_unfold(tcph->check))));
else
#endif
tcph->check =
- csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(tcph->check))));
+ csum_unfold(tcph->check))));
}
@@ -120,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
struct tcphdr *tcph;
unsigned int tcphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -134,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
- if (!ip_vs_app_pkt_out(cp, skb))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -151,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
htons(oldlen),
htons(skb->len - tcphoff));
- } else if (!cp->app) {
+ } else if (!payload_csum) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
tcph->check = 0;
@@ -174,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
skb->len - tcphoff,
cp->protocol,
skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
pp->name, tcph->check,
@@ -190,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
struct tcphdr *tcph;
unsigned int tcphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -204,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
@@ -212,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff
*/
- if (!ip_vs_app_pkt_in(cp, skb))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -223,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
* Adjust TCP checksums
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
htons(oldlen),
htons(skb->len - tcphoff));
- } else if (!cp->app) {
+ } else if (!payload_csum) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
tcph->check = 0;
@@ -278,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - tcphoff,
ipv6_hdr(skb)->nexthdr,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
@@ -289,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - tcphoff,
ip_hdr(skb)->protocol,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 8553231b5d4..9d106a06bb0 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
svc = ip_vs_service_get(af, skb->mark, iph.protocol,
&iph.daddr, uh->dest);
if (svc) {
+ int ignored;
+
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
@@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
- *cpp = ip_vs_schedule(svc, skb);
- if (!*cpp) {
+ *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+ if (!*cpp && !ignored) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
@@ -102,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
uhdr->check =
- csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(uhdr->check))));
+ csum_unfold(uhdr->check))));
else
#endif
uhdr->check =
- csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldlen, newlen,
- ~csum_unfold(uhdr->check))));
+ csum_unfold(uhdr->check))));
}
@@ -121,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
struct udphdr *udph;
unsigned int udphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -135,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
@@ -142,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
/*
* Call application helper if needed
*/
- if (!ip_vs_app_pkt_out(cp, skb))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
}
udph = (void *)skb_network_header(skb) + udphoff;
@@ -156,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
htons(oldlen),
htons(skb->len - udphoff));
- } else if (!cp->app && (udph->check != 0)) {
+ } else if (!payload_csum && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
udph->check = 0;
@@ -181,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
skb->csum);
if (udph->check == 0)
udph->check = CSUM_MANGLED_0;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
pp->name, udph->check,
(char*)&(udph->check) - (char*)udph);
@@ -196,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
struct udphdr *udph;
unsigned int udphoff;
int oldlen;
+ int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -210,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
return 0;
if (unlikely(cp->app != NULL)) {
+ int ret;
+
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
@@ -218,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn
*/
- if (!ip_vs_app_pkt_in(cp, skb))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
}
udph = (void *)skb_network_header(skb) + udphoff;
@@ -229,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
* Adjust UDP checksums
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
htons(oldlen),
htons(skb->len - udphoff));
- } else if (!cp->app && (udph->check != 0)) {
+ } else if (!payload_csum && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
udph->check = 0;
@@ -293,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - udphoff,
ipv6_hdr(skb)->nexthdr,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
@@ -304,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
skb->len - udphoff,
ip_hdr(skb)->protocol,
skb->csum)) {
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
"Failed checksum for");
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index bbc1ac79595..076ebe00435 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,7 +35,7 @@
static LIST_HEAD(ip_vs_schedulers);
/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_sched_lock);
+static DEFINE_SPINLOCK(ip_vs_sched_lock);
/*
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
{
int ret;
- if (svc == NULL) {
- pr_err("%s(): svc arg NULL\n", __func__);
- return -EINVAL;
- }
- if (scheduler == NULL) {
- pr_err("%s(): scheduler arg NULL\n", __func__);
- return -EINVAL;
- }
-
svc->scheduler = scheduler;
if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
*/
int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
{
- struct ip_vs_scheduler *sched;
+ struct ip_vs_scheduler *sched = svc->scheduler;
- if (svc == NULL) {
- pr_err("%s(): svc arg NULL\n", __func__);
- return -EINVAL;
- }
-
- sched = svc->scheduler;
- if (sched == NULL) {
- pr_err("%s(): svc isn't bound\n", __func__);
- return -EINVAL;
- }
+ if (!sched)
+ return 0;
if (sched->done_service) {
if (sched->done_service(svc) != 0) {
@@ -108,7 +91,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
- read_lock_bh(&__ip_vs_sched_lock);
+ spin_lock_bh(&ip_vs_sched_lock);
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
/*
@@ -122,14 +105,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
- read_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
return sched;
}
if (sched->module)
module_put(sched->module);
}
- read_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
return NULL;
}
@@ -159,7 +142,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
- if (scheduler->module)
+ if (scheduler && scheduler->module)
module_put(scheduler->module);
}
@@ -184,10 +167,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
/* increase the module use count */
ip_vs_use_count_inc();
- write_lock_bh(&__ip_vs_sched_lock);
+ spin_lock_bh(&ip_vs_sched_lock);
if (!list_empty(&scheduler->n_list)) {
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
@@ -200,7 +183,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
@@ -211,7 +194,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
pr_info("[%s] scheduler registered.\n", scheduler->name);
@@ -229,9 +212,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
return -EINVAL;
}
- write_lock_bh(&__ip_vs_sched_lock);
+ spin_lock_bh(&ip_vs_sched_lock);
if (list_empty(&scheduler->n_list)) {
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
pr_err("%s(): [%s] scheduler is not in the list. failed\n",
__func__, scheduler->name);
return -EINVAL;
@@ -241,7 +224,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
- write_unlock_bh(&__ip_vs_sched_lock);
+ spin_unlock_bh(&ip_vs_sched_lock);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 7ba06939829..ab85aedea17 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
ip_vs_sync_conn(cp->control);
}
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ /* XXX: Need to take into account persistence engine */
+ ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+ return 0;
+}
/*
* Process received multicast message and create the corresponding
@@ -301,6 +311,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
struct ip_vs_dest *dest;
+ struct ip_vs_conn_param param;
char *p;
int i;
@@ -370,18 +381,20 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
}
}
- if (!(flags & IP_VS_CONN_F_TEMPLATE))
- cp = ip_vs_conn_in_get(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport);
- else
- cp = ip_vs_ct_in_get(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport);
+ {
+ if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
+ (union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (union nf_inet_addr *)&s->vaddr,
+ s->vport, &param)) {
+ pr_err("ip_vs_conn_fill_param_sync failed");
+ return;
+ }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ cp = ip_vs_conn_in_get(&param);
+ else
+ cp = ip_vs_ct_in_get(&param);
+ }
if (!cp) {
/*
* Find the appropriate destination for the connection.
@@ -406,14 +419,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
else
flags &= ~IP_VS_CONN_F_INACTIVE;
}
- cp = ip_vs_conn_new(AF_INET, s->protocol,
- (union nf_inet_addr *)&s->caddr,
- s->cport,
- (union nf_inet_addr *)&s->vaddr,
- s->vport,
+ cp = ip_vs_conn_new(&param,
(union nf_inet_addr *)&s->daddr,
- s->dport,
- flags, dest);
+ s->dport, flags, dest);
if (dest)
atomic_dec(&dest->refcnt);
if (!cp) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 21e1a5e9b9d..de04ea39cde 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -11,6 +11,16 @@
*
* Changes:
*
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
*/
#define KMSG_COMPONENT "IPVS"
@@ -26,9 +36,9 @@
#include <net/route.h> /* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
@@ -38,26 +48,27 @@
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+ u32 dst_cookie)
{
struct dst_entry *old_dst;
old_dst = dest->dst_cache;
dest->dst_cache = dst;
dest->dst_rtos = rtos;
+ dest->dst_cookie = dst_cookie;
dst_release(old_dst);
}
static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
{
struct dst_entry *dst = dest->dst_cache;
if (!dst)
return NULL;
- if ((dst->obsolete
- || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
- dst->ops->check(dst, cookie) == NULL) {
+ if ((dst->obsolete || rtos != dest->dst_rtos) &&
+ dst->ops->check(dst, dest->dst_cookie) == NULL) {
dest->dst_cache = NULL;
dst_release(dst);
return NULL;
@@ -66,16 +77,24 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
return dst;
}
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ * &4=Allow redirect from remote daddr to local
+ */
static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+ __be32 daddr, u32 rtos, int rt_mode)
{
+ struct net *net = dev_net(skb_dst(skb)->dev);
struct rtable *rt; /* Route to the other host */
- struct ip_vs_dest *dest = cp->dest;
+ struct rtable *ort; /* Original route */
+ int local;
if (dest) {
spin_lock(&dest->dst_lock);
if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos, 0))) {
+ __ip_vs_dst_check(dest, rtos))) {
struct flowi fl = {
.oif = 0,
.nl_u = {
@@ -85,13 +104,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
.tos = rtos, } },
};
- if (ip_route_output_key(&init_net, &rt, &fl)) {
+ if (ip_route_output_key(net, &rt, &fl)) {
spin_unlock(&dest->dst_lock);
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
&dest->addr.ip);
return NULL;
}
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
+ __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
&dest->addr.ip,
atomic_read(&rt->dst.__refcnt), rtos);
@@ -102,78 +121,199 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
.oif = 0,
.nl_u = {
.ip4_u = {
- .daddr = cp->daddr.ip,
+ .daddr = daddr,
.saddr = 0,
.tos = rtos, } },
};
- if (ip_route_output_key(&init_net, &rt, &fl)) {
+ if (ip_route_output_key(net, &rt, &fl)) {
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
- &cp->daddr.ip);
+ &daddr);
return NULL;
}
}
+ local = rt->rt_flags & RTCF_LOCAL;
+ if (!((local ? 1 : 2) & rt_mode)) {
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+ (rt->rt_flags & RTCF_LOCAL) ?
+ "local":"non-local", &rt->rt_dst);
+ ip_rt_put(rt);
+ return NULL;
+ }
+ if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
+ ort->rt_flags & RTCF_LOCAL)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+ "requires NAT method, dest: %pI4\n",
+ &ip_hdr(skb)->daddr, &rt->rt_dst);
+ ip_rt_put(rt);
+ return NULL;
+ }
+ if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+ "to non-local address, dest: %pI4\n",
+ &ip_hdr(skb)->saddr, &rt->rt_dst);
+ ip_rt_put(rt);
+ return NULL;
+ }
+
return rt;
}
+/* Reroute packet to local IPv4 stack after DNAT */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct net_device *dev = rt->dst.dev;
+ struct net *net = dev_net(dev);
+ struct iphdr *iph = ip_hdr(skb);
+
+ if (rt->fl.iif) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ if (ip_route_input(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev))
+ return 0;
+ refdst_drop(orefdst);
+ } else {
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos),
+ }
+ },
+ .mark = skb->mark,
+ };
+ struct rtable *rt;
+
+ if (ip_route_output_key(net, &rt, &fl))
+ return 0;
+ if (!(rt->rt_flags & RTCF_LOCAL)) {
+ ip_rt_put(rt);
+ return 0;
+ }
+ /* Drop old route. */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ }
+ return 1;
+}
+
#ifdef CONFIG_IP_VS_IPV6
+
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+ return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
+}
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+ struct in6_addr *ret_saddr, int do_xfrm)
+{
+ struct dst_entry *dst;
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip6_u = {
+ .daddr = *daddr,
+ },
+ },
+ };
+
+ dst = ip6_route_output(net, NULL, &fl);
+ if (dst->error)
+ goto out_err;
+ if (!ret_saddr)
+ return dst;
+ if (ipv6_addr_any(&fl.fl6_src) &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+ goto out_err;
+ if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+ goto out_err;
+ ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+ return dst;
+
+out_err:
+ dst_release(dst);
+ IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+ return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ * &4=Allow redirect from remote daddr to local
+ */
static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+ struct in6_addr *daddr, struct in6_addr *ret_saddr,
+ int do_xfrm, int rt_mode)
{
+ struct net *net = dev_net(skb_dst(skb)->dev);
struct rt6_info *rt; /* Route to the other host */
- struct ip_vs_dest *dest = cp->dest;
+ struct rt6_info *ort; /* Original route */
+ struct dst_entry *dst;
+ int local;
if (dest) {
spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+ rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
if (!rt) {
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip6_u = {
- .daddr = dest->addr.in6,
- .saddr = {
- .s6_addr32 =
- { 0, 0, 0, 0 },
- },
- },
- },
- };
+ u32 cookie;
- rt = (struct rt6_info *)ip6_route_output(&init_net,
- NULL, &fl);
- if (!rt) {
+ dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+ &dest->dst_saddr,
+ do_xfrm);
+ if (!dst) {
spin_unlock(&dest->dst_lock);
- IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
- &dest->addr.in6);
return NULL;
}
- __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
- IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
- &dest->addr.in6,
+ rt = (struct rt6_info *) dst;
+ cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+ IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+ &dest->addr.in6, &dest->dst_saddr,
atomic_read(&rt->dst.__refcnt));
}
+ if (ret_saddr)
+ ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
spin_unlock(&dest->dst_lock);
} else {
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip6_u = {
- .daddr = cp->daddr.in6,
- .saddr = {
- .s6_addr32 = { 0, 0, 0, 0 },
- },
- },
- },
- };
-
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
- if (!rt) {
- IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
- &cp->daddr.in6);
+ dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+ if (!dst)
return NULL;
- }
+ rt = (struct rt6_info *) dst;
+ }
+
+ local = __ip_vs_is_local_route6(rt);
+ if (!((local ? 1 : 2) & rt_mode)) {
+ IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+ local ? "local":"non-local", daddr);
+ dst_release(&rt->dst);
+ return NULL;
+ }
+ if (local && !(rt_mode & 4) &&
+ !((ort = (struct rt6_info *) skb_dst(skb)) &&
+ __ip_vs_is_local_route6(ort))) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+ "requires NAT method, dest: %pI6\n",
+ &ipv6_hdr(skb)->daddr, daddr);
+ dst_release(&rt->dst);
+ return NULL;
+ }
+ if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+ IPV6_ADDR_LOOPBACK)) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+ "to non-local address, dest: %pI6\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ dst_release(&rt->dst);
+ return NULL;
}
return rt;
@@ -194,12 +334,44 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
dst_release(old_dst);
}
-#define IP_VS_XMIT(pf, skb, rt) \
+#define IP_VS_XMIT_TUNNEL(skb, cp) \
+({ \
+ int __ret = NF_ACCEPT; \
+ \
+ (skb)->ipvs_property = 1; \
+ if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
+ __ret = ip_vs_confirm_conntrack(skb, cp); \
+ if (__ret == NF_ACCEPT) { \
+ nf_reset(skb); \
+ skb_forward_csum(skb); \
+ } \
+ __ret; \
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
+do { \
+ (skb)->ipvs_property = 1; \
+ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+ ip_vs_notrack(skb); \
+ else \
+ ip_vs_update_conntrack(skb, cp, 1); \
+ if (local) \
+ return NF_ACCEPT; \
+ skb_forward_csum(skb); \
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
+ skb_dst(skb)->dev, dst_output); \
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp, local) \
do { \
(skb)->ipvs_property = 1; \
+ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+ ip_vs_notrack(skb); \
+ if (local) \
+ return NF_ACCEPT; \
skb_forward_csum(skb); \
NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- (rt)->dst.dev, dst_output); \
+ skb_dst(skb)->dev, dst_output); \
} while (0)
@@ -211,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
/* we do not touch skb and do not need pskb ptr */
- return NF_ACCEPT;
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
}
@@ -226,24 +398,13 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph = ip_hdr(skb);
- u8 tos = iph->tos;
int mtu;
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip4_u = {
- .daddr = iph->daddr,
- .saddr = 0,
- .tos = RT_TOS(tos), } },
- };
EnterFunction(10);
- if (ip_route_output_key(&init_net, &rt, &fl)) {
- IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
- __func__, &iph->daddr);
+ if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+ RT_TOS(iph->tos), 2)))
goto tx_error_icmp;
- }
/* MTU checking */
mtu = dst_mtu(&rt->dst);
@@ -271,7 +432,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -292,28 +453,22 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rt6_info *rt; /* Route to the other host */
struct ipv6hdr *iph = ipv6_hdr(skb);
int mtu;
- struct flowi fl = {
- .oif = 0,
- .nl_u = {
- .ip6_u = {
- .daddr = iph->daddr,
- .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
- };
EnterFunction(10);
- rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
- if (!rt) {
- IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
- __func__, &iph->daddr);
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
goto tx_error_icmp;
- }
/* MTU checking */
mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
- dst_release(&rt->dst);
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ dst_release(&rt->dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
}
@@ -335,7 +490,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -349,30 +504,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
#endif
-static void
-ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
-{
- struct nf_conn *ct = (struct nf_conn *)skb->nfct;
- struct nf_conntrack_tuple new_tuple;
-
- if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
- return;
-
- /*
- * The connection is not yet in the hashtable, so we update it.
- * CIP->VIP will remain the same, so leave the tuple in
- * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
- * real-server we will see RIP->DIP.
- */
- new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- new_tuple.src.u3 = cp->daddr;
- /*
- * This will also take care of UDP and other protocols.
- */
- new_tuple.src.u.tcp.port = cp->dport;
- nf_conntrack_alter_reply(ct, &new_tuple);
-}
-
/*
* NAT transmitter (only for outside-to-inside nat forwarding)
* Not used for related ICMP
@@ -384,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int mtu;
struct iphdr *iph = ip_hdr(skb);
+ int local;
EnterFunction(10);
@@ -397,16 +529,42 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(iph->tos), 1|2|4)))
goto tx_error_icmp;
+ local = rt->rt_flags & RTCF_LOCAL;
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+ "ip_vs_nat_xmit(): "
+ "stopping DNAT to local address");
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+ IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+ "stopping DNAT to loopback address");
+ goto tx_error_put;
+ }
/* MTU checking */
mtu = dst_mtu(&rt->dst);
if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
- ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
- goto tx_error;
+ IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+ "ip_vs_nat_xmit(): frag needed for");
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
@@ -416,19 +574,28 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
- goto tx_error;
+ goto tx_error_put;
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+ if (!local) {
+ /* drop old route */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ ip_rt_put(rt);
+ /*
+ * Some IPv4 replies get local address from routes,
+ * not from iph, so while we DNAT after routing
+ * we need this second input/output route.
+ */
+ if (!__ip_vs_reroute_locally(skb))
+ goto tx_error;
+ }
- ip_vs_update_conntrack(skb, cp);
+ IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -437,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
LeaveFunction(10);
return NF_STOLEN;
@@ -445,8 +612,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- LeaveFunction(10);
kfree_skb(skb);
+ LeaveFunction(10);
return NF_STOLEN;
tx_error_put:
ip_rt_put(rt);
@@ -460,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct rt6_info *rt; /* Route to the other host */
int mtu;
+ int local;
EnterFunction(10);
@@ -474,18 +642,49 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, 1|2|4)))
goto tx_error_icmp;
+ local = __ip_vs_is_local_route6(rt);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to local address");
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to loopback address");
+ goto tx_error_put;
+ }
/* MTU checking */
mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
- dst_release(&rt->dst);
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+ IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): frag needed for");
- goto tx_error;
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
@@ -495,18 +694,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
goto tx_error;
- ipv6_hdr(skb)->daddr = cp->daddr.in6;
+ ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
- IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+ if (!local || !skb->dev) {
+ /* drop the old route when skb is not shared */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ /* destined to loopback, do we need to change route? */
+ dst_release(&rt->dst);
+ }
- ip_vs_update_conntrack(skb, cp);
+ IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
@@ -515,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
LeaveFunction(10);
return NF_STOLEN;
@@ -561,30 +763,27 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct iphdr *old_iph = ip_hdr(skb);
u8 tos = old_iph->tos;
__be16 df = old_iph->frag_off;
- sk_buff_data_t old_transport_header = skb->transport_header;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int mtu;
+ int ret;
EnterFunction(10);
- if (skb->protocol != htons(ETH_P_IP)) {
- IP_VS_DBG_RL("%s(): protocol error, "
- "ETH_P_IP: %d, skb protocol: %d\n",
- __func__, htons(ETH_P_IP), skb->protocol);
- goto tx_error;
- }
-
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(tos), 1|2)))
goto tx_error_icmp;
+ if (rt->rt_flags & RTCF_LOCAL) {
+ ip_rt_put(rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ }
tdev = rt->dst.dev;
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (mtu < 68) {
- ip_rt_put(rt);
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -594,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if ((old_iph->frag_off & htons(IP_DF))
&& mtu < ntohs(old_iph->tot_len)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/*
@@ -619,7 +817,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
old_iph = ip_hdr(skb);
}
- skb->transport_header = old_transport_header;
+ skb->transport_header = skb->network_header;
/* fix old IP header checksum */
ip_send_check(old_iph);
@@ -649,7 +847,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ip_local_out(skb);
+ ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
LeaveFunction(10);
@@ -661,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
+tx_error_put:
+ ip_rt_put(rt);
+ goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
@@ -669,43 +874,44 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rt6_info *rt; /* Route to the other host */
+ struct in6_addr saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
struct ipv6hdr *old_iph = ipv6_hdr(skb);
- sk_buff_data_t old_transport_header = skb->transport_header;
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int mtu;
+ int ret;
EnterFunction(10);
- if (skb->protocol != htons(ETH_P_IPV6)) {
- IP_VS_DBG_RL("%s(): protocol error, "
- "ETH_P_IPV6: %d, skb protocol: %d\n",
- __func__, htons(ETH_P_IPV6), skb->protocol);
- goto tx_error;
- }
-
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, 1, 1|2)))
goto tx_error_icmp;
+ if (__ip_vs_is_local_route6(rt)) {
+ dst_release(&rt->dst);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ }
tdev = rt->dst.dev;
mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
- /* TODO IPv6: do we need this check in IPv6? */
- if (mtu < 1280) {
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
- goto tx_error;
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto tx_error_put;
}
if (skb_dst(skb))
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/*
@@ -728,7 +934,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
old_iph = ipv6_hdr(skb);
}
- skb->transport_header = old_transport_header;
+ skb->transport_header = skb->network_header;
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
@@ -748,14 +954,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
iph->priority = old_iph->priority;
memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
- iph->daddr = rt->rt6i_dst.addr;
- iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */
+ ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
+ ipv6_addr_copy(&iph->saddr, &saddr);
iph->hop_limit = old_iph->hop_limit;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ip6_local_out(skb);
+ ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip6_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
LeaveFunction(10);
@@ -767,6 +977,9 @@ tx_error:
kfree_skb(skb);
LeaveFunction(10);
return NF_STOLEN;
+tx_error_put:
+ dst_release(&rt->dst);
+ goto tx_error;
}
#endif
@@ -785,8 +998,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(iph->tos), 1|2)))
goto tx_error_icmp;
+ if (rt->rt_flags & RTCF_LOCAL) {
+ ip_rt_put(rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ }
/* MTU checking */
mtu = dst_mtu(&rt->dst);
@@ -814,7 +1032,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -837,13 +1055,22 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
EnterFunction(10);
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, 1|2)))
goto tx_error_icmp;
+ if (__ip_vs_is_local_route6(rt)) {
+ dst_release(&rt->dst);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ }
/* MTU checking */
mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
dst_release(&rt->dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -867,7 +1094,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
LeaveFunction(10);
return NF_STOLEN;
@@ -893,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int mtu;
int rc;
+ int local;
EnterFunction(10);
@@ -913,16 +1141,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
* mangle and send the packet here (only for VS/NAT)
*/
- if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
goto tx_error_icmp;
+ local = rt->rt_flags & RTCF_LOCAL;
+
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error_put;
+ }
/* MTU checking */
mtu = dst_mtu(&rt->dst);
if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
- ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
@@ -932,16 +1187,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
ip_vs_nat_icmp(skb, pp, cp, 0);
+ if (!local) {
+ /* drop the old route when skb is not shared */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ ip_rt_put(rt);
+ /*
+ * Some IPv4 replies get local address from routes,
+ * not from iph, so while we DNAT after routing
+ * we need this second input/output route.
+ */
+ if (!__ip_vs_reroute_locally(skb))
+ goto tx_error;
+ }
+
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
rc = NF_STOLEN;
goto out;
@@ -967,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rt6_info *rt; /* Route to the other host */
int mtu;
int rc;
+ int local;
EnterFunction(10);
@@ -987,17 +1254,49 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
* mangle and send the packet here (only for VS/NAT)
*/
- rt = __ip_vs_get_out_rt_v6(cp);
- if (!rt)
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ 0, 1|2|4)))
goto tx_error_icmp;
+ local = __ip_vs_is_local_route6(rt);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error_put;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error_put;
+ }
+
/* MTU checking */
mtu = dst_mtu(&rt->dst);
if (skb->len > mtu) {
- dst_release(&rt->dst);
+ if (!skb->dev) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net->loopback_dev;
+ }
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error;
+ goto tx_error_put;
}
/* copy-on-write the packet before mangling it */
@@ -1007,16 +1306,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (skb_cow(skb, rt->dst.dev->hard_header_len))
goto tx_error_put;
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+ if (!local || !skb->dev) {
+ /* drop the old route when skb is not shared */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ /* destined to loopback, do we need to change route? */
+ dst_release(&rt->dst);
+ }
+
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
rc = NF_STOLEN;
goto out;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index df3eedb142f..1eacf8d9966 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -65,32 +65,42 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
- u16 zone, unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
{
unsigned int n;
- u_int32_t h;
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- h = jhash2((u32 *)tuple, n,
- zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
- tuple->dst.protonum));
+ return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+ (((__force __u16)tuple->dst.u.all << 16) |
+ tuple->dst.protonum));
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+ return ((u64)hash * size) >> 32;
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+ return __hash_bucket(hash, net->ct.htable_size);
+}
- return ((u64)h * size) >> 32;
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+ u16 zone, unsigned int size)
+{
+ return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
}
static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
const struct nf_conntrack_tuple *tuple)
{
- return __hash_conntrack(tuple, zone, net->ct.htable_size,
- nf_conntrack_hash_rnd);
+ return __hash_conntrack(tuple, zone, net->ct.htable_size);
}
bool
@@ -292,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
* OR
* - Caller must lock nf_conntrack_lock before calling this function
*/
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int hash = hash_conntrack(net, zone, tuple);
+ unsigned int bucket = hash_bucket(hash, net);
/* Disable BHs the entire time since we normally need to disable them
* at least once for the stats anyway.
*/
local_bh_disable();
begin:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+ hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
if (nf_ct_tuple_equal(tuple, &h->tuple) &&
nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
NF_CT_STAT_INC(net, found);
@@ -319,7 +329,7 @@ begin:
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
- if (get_nulls_value(n) != hash) {
+ if (get_nulls_value(n) != bucket) {
NF_CT_STAT_INC(net, search_restart);
goto begin;
}
@@ -327,19 +337,27 @@ begin:
return NULL;
}
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return ____nf_conntrack_find(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);
/* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
rcu_read_lock();
begin:
- h = __nf_conntrack_find(net, zone, tuple);
+ h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (unlikely(nf_ct_is_dying(ct) ||
@@ -357,6 +375,14 @@ begin:
return h;
}
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone));
+}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -409,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
zone = nf_ct_zone(ct);
- hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ /* reuse the hash saved before */
+ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+ hash = hash_bucket(hash, net);
+ repl_hash = hash_conntrack(net, zone,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* We're not in hash table, and we refuse to set up related
connections for unconfirmed conns. But packet copies and
@@ -567,17 +596,29 @@ static noinline int early_drop(struct net *net, unsigned int hash)
return dropped;
}
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
- const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_tuple *repl,
- gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp, u32 hash)
{
struct nf_conn *ct;
- if (unlikely(!nf_conntrack_hash_rnd_initted)) {
- get_random_bytes(&nf_conntrack_hash_rnd,
- sizeof(nf_conntrack_hash_rnd));
- nf_conntrack_hash_rnd_initted = 1;
+ if (unlikely(!nf_conntrack_hash_rnd)) {
+ unsigned int rand;
+
+ /*
+ * Why not initialize nf_conntrack_rnd in a "init()" function ?
+ * Because there isn't enough entropy when system initializing,
+ * and we initialize it as late as possible.
+ */
+ do {
+ get_random_bytes(&rand, sizeof(rand));
+ } while (!rand);
+ cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+
+ /* recompute the hash as nf_conntrack_hash_rnd is initialized */
+ hash = hash_conntrack_raw(orig, zone);
}
/* We don't want any race condition at early drop stage */
@@ -585,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
if (nf_conntrack_max &&
unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
- unsigned int hash = hash_conntrack(net, zone, orig);
- if (!early_drop(net, hash)) {
+ if (!early_drop(net, hash_bucket(hash, net))) {
atomic_dec(&net->ct.count);
if (net_ratelimit())
printk(KERN_WARNING
@@ -616,7 +656,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
- ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+ /* save hash for reusing when confirming */
+ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
/* Don't set timer yet: wait for confirmation */
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
write_pnet(&ct->ct_net, net);
@@ -643,6 +684,14 @@ out_free:
return ERR_PTR(-ENOMEM);
#endif
}
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ gfp_t gfp)
+{
+ return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
void nf_conntrack_free(struct nf_conn *ct)
@@ -664,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
- unsigned int dataoff)
+ unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
@@ -678,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
return NULL;
}
- ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+ ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+ hash);
if (IS_ERR(ct)) {
pr_debug("Can't allocate conntrack.\n");
return (struct nf_conntrack_tuple_hash *)ct;
@@ -755,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+ u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, &tuple, l3proto,
@@ -764,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
}
/* look for tuple match */
- h = nf_conntrack_find_get(net, zone, &tuple);
+ hash = hash_conntrack_raw(&tuple, zone);
+ h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
- skb, dataoff);
+ skb, dataoff, hash);
if (!h)
return NULL;
if (IS_ERR(h))
@@ -1307,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
- hashsize,
- nf_conntrack_hash_rnd);
+ hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc7649476..5702de35e2b 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
EXPORT_SYMBOL_GPL(nf_expect_event_cb);
/* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acb29ccaa41..46e8966912b 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -38,25 +38,30 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
/* nf_conntrack_expect helper functions */
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+ u32 pid, int report)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
- NF_CT_ASSERT(master_help);
NF_CT_ASSERT(!timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
net->ct.expect_count--;
hlist_del(&exp->lnode);
- master_help->expecting[exp->class]--;
+ if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+ master_help->expecting[exp->class]--;
+
+ nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
nf_ct_expect_put(exp);
NF_CT_STAT_INC(net, expect_delete);
}
-EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
@@ -320,16 +325,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
atomic_inc(&exp->use);
- hlist_add_head(&exp->lnode, &master_help->expectations);
- master_help->expecting[exp->class]++;
+ if (master_help) {
+ hlist_add_head(&exp->lnode, &master_help->expectations);
+ master_help->expecting[exp->class]++;
+ } else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+ hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
(unsigned long)exp);
- p = &master_help->helper->expect_policy[exp->class];
- exp->timeout.expires = jiffies + p->timeout * HZ;
+ if (master_help) {
+ p = &master_help->helper->expect_policy[exp->class];
+ exp->timeout.expires = jiffies + p->timeout * HZ;
+ }
add_timer(&exp->timeout);
atomic_inc(&exp->use);
@@ -380,7 +390,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
unsigned int h;
int ret = 1;
- if (!master_help->helper) {
+ /* Don't allow expectations created from kernel-space with no helper */
+ if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
+ (!master_help || (master_help && !master_help->helper))) {
ret = -ESHUTDOWN;
goto out;
}
@@ -398,13 +410,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
}
/* Will be over limit? */
- p = &master_help->helper->expect_policy[expect->class];
- if (p->max_expected &&
- master_help->expecting[expect->class] >= p->max_expected) {
- evict_oldest_expect(master, expect);
- if (master_help->expecting[expect->class] >= p->max_expected) {
- ret = -EMFILE;
- goto out;
+ if (master_help) {
+ p = &master_help->helper->expect_policy[expect->class];
+ if (p->max_expected &&
+ master_help->expecting[expect->class] >= p->max_expected) {
+ evict_oldest_expect(master, expect);
+ if (master_help->expecting[expect->class]
+ >= p->max_expected) {
+ ret = -EMFILE;
+ goto out;
+ }
}
}
@@ -439,6 +454,21 @@ out:
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
+void nf_ct_remove_userspace_expectations(void)
+{
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *n, *next;
+
+ hlist_for_each_entry_safe(exp, n, next,
+ &nf_ct_userspace_expect_list, lnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
#ifdef CONFIG_PROC_FS
struct ct_expect_iter_state {
struct seq_net_private p;
@@ -529,8 +559,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
seq_printf(s, "PERMANENT");
delim = ",";
}
- if (expect->flags & NF_CT_EXPECT_INACTIVE)
+ if (expect->flags & NF_CT_EXPECT_INACTIVE) {
seq_printf(s, "%sINACTIVE", delim);
+ delim = ",";
+ }
+ if (expect->flags & NF_CT_EXPECT_USERSPACE)
+ seq_printf(s, "%sUSERSPACE", delim);
helper = rcu_dereference(nfct_help(expect->master)->helper);
if (helper) {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 7dcf7a40419..bd82450c193 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>
-static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM];
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
static DEFINE_MUTEX(nf_ct_ext_type_mutex);
void __nf_ct_ext_destroy(struct nf_conn *ct)
@@ -48,15 +48,17 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int off, len;
struct nf_ct_ext_type *t;
+ size_t alloc_size;
rcu_read_lock();
t = rcu_dereference(nf_ct_ext_types[id]);
BUG_ON(t == NULL);
off = ALIGN(sizeof(struct nf_ct_ext), t->align);
len = off + t->len;
+ alloc_size = t->alloc_size;
rcu_read_unlock();
- *ext = kzalloc(t->alloc_size, gfp);
+ *ext = kzalloc(alloc_size, gfp);
if (!*ext)
return NULL;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5bae1cd15ee..b729ace1dcc 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
#include <linux/rculist_nulls.h>
#include <linux/types.h>
#include <linux/timer.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static inline int
-ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
- NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark));
- return 0;
+ struct nlattr *nest_secctx;
+ int len, ret;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return ret;
+
+ ret = -1;
+ nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+ if (!nest_secctx)
+ goto nla_put_failure;
+ NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+ nla_nest_end(skb, nest_secctx);
+
+ ret = 0;
nla_put_failure:
- return -1;
+ security_release_secctx(secctx, len);
+ return ret;
}
#else
-#define ctnetlink_dump_secmark(a, b) (0)
+#define ctnetlink_dump_secctx(a, b) (0)
#endif
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
ctnetlink_dump_protoinfo(skb, ct) < 0 ||
ctnetlink_dump_helpinfo(skb, ct) < 0 ||
ctnetlink_dump_mark(skb, ct) < 0 ||
- ctnetlink_dump_secmark(skb, ct) < 0 ||
+ ctnetlink_dump_secctx(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
ctnetlink_dump_use(skb, ct) < 0 ||
ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
;
}
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
+{
+ int len;
+
+ security_secid_to_secctx(ct->secmark, NULL, &len);
+
+ return sizeof(char) * len;
+}
+#endif
+
static inline size_t
ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */
+ + nla_total_size(0) /* CTA_SECCTX */
+ + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
#endif
#ifdef CONFIG_NF_NAT_NEEDED
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
#ifdef CONFIG_NF_CONNTRACK_SECMARK
if ((events & (1 << IPCT_SECMARK) || ct->secmark)
- && ctnetlink_dump_secmark(skb, ct) < 0)
+ && ctnetlink_dump_secctx(skb, ct) < 0)
goto nla_put_failure;
#endif
@@ -1560,8 +1588,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
const struct nf_conntrack_expect *exp)
{
struct nf_conn *master = exp->master;
- struct nf_conntrack_helper *helper;
long timeout = (exp->timeout.expires - jiffies) / HZ;
+ struct nf_conn_help *help;
if (timeout < 0)
timeout = 0;
@@ -1577,9 +1605,15 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
- helper = rcu_dereference(nfct_help(master)->helper);
- if (helper)
- NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+ NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
+ help = nfct_help(master);
+ if (help) {
+ struct nf_conntrack_helper *helper;
+
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+ }
return 0;
@@ -1626,17 +1660,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct sk_buff *skb;
- unsigned int type;
+ unsigned int type, group;
int flags = 0;
- if (events & (1 << IPEXP_NEW)) {
+ if (events & (1 << IPEXP_DESTROY)) {
+ type = IPCTNL_MSG_EXP_DELETE;
+ group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+ } else if (events & (1 << IPEXP_NEW)) {
type = IPCTNL_MSG_EXP_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
+ group = NFNLGRP_CONNTRACK_EXP_NEW;
} else
return 0;
- if (!item->report &&
- !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
+ if (!item->report && !nfnetlink_has_listeners(net, group))
return 0;
skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1659,8 +1696,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
rcu_read_unlock();
nlmsg_end(skb, nlh);
- nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
- item->report, GFP_ATOMIC);
+ nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
return 0;
nla_put_failure:
@@ -1733,6 +1769,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
[CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 },
[CTA_EXPECT_ID] = { .type = NLA_U32 },
[CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING },
+ [CTA_EXPECT_ZONE] = { .type = NLA_U16 },
+ [CTA_EXPECT_FLAGS] = { .type = NLA_U32 },
};
static int
@@ -1841,7 +1879,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
}
/* after list removal, usage count == 1 */
- nf_ct_unexpect_related(exp);
+ spin_lock_bh(&nf_conntrack_lock);
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
+ nf_ct_expect_put(exp);
+ }
+ spin_unlock_bh(&nf_conntrack_lock);
/* have to put what we 'get' above.
* after this line usage count == 0 */
nf_ct_expect_put(exp);
@@ -1858,7 +1902,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
@@ -1872,7 +1918,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
&net->ct.expect_hash[i],
hnode) {
if (del_timer(&exp->timeout)) {
- nf_ct_unlink_expect(exp);
+ nf_ct_unlink_expect_report(exp,
+ NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
nf_ct_expect_put(exp);
}
}
@@ -1918,23 +1966,35 @@ ctnetlink_create_expect(struct net *net, u16 zone,
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- help = nfct_help(ct);
-
- if (!help || !help->helper) {
- /* such conntrack hasn't got any helper, abort */
- err = -EOPNOTSUPP;
- goto out;
- }
-
exp = nf_ct_expect_alloc(ct);
if (!exp) {
err = -ENOMEM;
goto out;
}
+ help = nfct_help(ct);
+ if (!help) {
+ if (!cda[CTA_EXPECT_TIMEOUT]) {
+ err = -EINVAL;
+ goto out;
+ }
+ exp->timeout.expires =
+ jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+ exp->flags = NF_CT_EXPECT_USERSPACE;
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags |=
+ ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ }
+ } else {
+ if (cda[CTA_EXPECT_FLAGS]) {
+ exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+ exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+ } else
+ exp->flags = 0;
+ }
exp->class = 0;
exp->expectfn = NULL;
- exp->flags = 0;
exp->master = ct;
exp->helper = NULL;
memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
@@ -2102,6 +2162,7 @@ static void __exit ctnetlink_exit(void)
{
pr_info("ctnetlink: unregistering from nfnetlink.\n");
+ nf_ct_remove_userspace_expectations();
#ifdef CONFIG_NF_CONNTRACK_EVENTS
nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
nf_conntrack_unregister_notifier(&ctnl_notifier);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1d52a..ed6d9295802 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
-struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_l3protos);
static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 53d892210a0..bcf47eb518e 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
return len;
}
+static int iswordc(const char c)
+{
+ if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+ (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+ c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+ c == '{' || c == '}' || c == '~')
+ return 1;
+ return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+ int len = 0;
+ while (dptr < limit && iswordc(*dptr)) {
+ dptr++;
+ len++;
+ }
+ return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+ const char *limit, int *shift)
+{
+ int len, domain_len;
+
+ len = word_len(dptr, limit);
+ dptr += len;
+ if (!len || dptr == limit || *dptr != '@')
+ return len;
+ dptr++;
+ len++;
+
+ domain_len = word_len(dptr, limit);
+ if (!domain_len)
+ return 0;
+ return len + domain_len;
+}
+
/* get media type + port length */
static int media_len(const struct nf_conn *ct, const char *dptr,
const char *limit, int *shift)
@@ -152,6 +190,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
const char *end;
int ret = 0;
+ if (!ct)
+ return 0;
+
memset(addr, 0, sizeof(*addr));
switch (nf_ct_l3num(ct)) {
case AF_INET:
@@ -296,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
[SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len),
[SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len),
[SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len),
+ [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len),
};
static const char *sip_follow_continuation(const char *dptr, const char *limit)
@@ -1376,7 +1418,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
unsigned int msglen, origlen;
const char *dptr, *end;
s16 diff, tdiff = 0;
- int ret;
+ int ret = NF_ACCEPT;
typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
if (ctinfo != IP_CT_ESTABLISHED &&
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index eb973fcd67a..0fb65705b44 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
#include <linux/seq_file.h>
#include <linux/percpu.h>
#include <linux/netdevice.h>
+#include <linux/security.h>
#include <net/net_namespace.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
rcu_read_unlock();
}
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return ret;
+
+ ret = seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len);
+ return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0;
+}
+#endif
+
/* return 0 on success, 1 in case of error */
static int ct_seq_show(struct seq_file *s, void *v)
{
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#endif
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
- if (seq_printf(s, "secmark=%u ", ct->secmark))
+ if (ct_show_secctx(s, ct))
goto release;
-#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd786b..b07393eab88 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
-static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(nf_log_mutex);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9c519..74aebed5bd2 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
* long term mutex. The handler must provide an an outfn() to accept packets
* for queueing and must reinject all packets it receives, no matter what.
*/
-static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(queue_handler_mutex);
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 5490fc37c92..4d87befb04c 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -18,41 +18,6 @@
#include <net/udp.h>
#include <net/netfilter/nf_tproxy_core.h>
-struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
- const __be32 saddr, const __be32 daddr,
- const __be16 sport, const __be16 dport,
- const struct net_device *in, bool listening_only)
-{
- struct sock *sk;
-
- /* look up socket */
- switch (protocol) {
- case IPPROTO_TCP:
- if (listening_only)
- sk = __inet_lookup_listener(net, &tcp_hashinfo,
- daddr, ntohs(dport),
- in->ifindex);
- else
- sk = __inet_lookup(net, &tcp_hashinfo,
- saddr, sport, daddr, dport,
- in->ifindex);
- break;
- case IPPROTO_UDP:
- sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
- in->ifindex);
- break;
- default:
- WARN_ON(1);
- sk = NULL;
- }
-
- pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
- protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
-
- return sk;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
static void
nf_tproxy_destructor(struct sk_buff *skb)
@@ -70,7 +35,11 @@ nf_tproxy_destructor(struct sk_buff *skb)
int
nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
- if (inet_sk(sk)->transparent) {
+ bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_transparent :
+ inet_sk(sk)->transparent;
+
+ if (transparent) {
skb_orphan(skb);
skb->sk = sk;
skb->destructor = nf_tproxy_destructor;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e34622fa000..80463507420 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets);
void
xt_unregister_targets(struct xt_target *target, unsigned int n)
{
- unsigned int i;
-
- for (i = 0; i < n; i++)
- xt_unregister_target(&target[i]);
+ while (n-- > 0)
+ xt_unregister_target(&target[n]);
}
EXPORT_SYMBOL(xt_unregister_targets);
@@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches);
void
xt_unregister_matches(struct xt_match *match, unsigned int n)
{
- unsigned int i;
-
- for (i = 0; i < n; i++)
- xt_unregister_match(&match[i]);
+ while (n-- > 0)
+ xt_unregister_match(&match[n]);
}
EXPORT_SYMBOL(xt_unregister_matches);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 0cb6053f02f..782e51986a6 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/skbuff.h>
-#include <linux/selinux.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 23b2d6c486b..9faf5e050b7 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
-#include <linux/selinux.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SECMARK.h>
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
switch (mode) {
case SECMARK_MODE_SEL:
- secmark = info->u.sel.selsid;
+ secmark = info->secid;
break;
-
default:
BUG();
}
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static int checkentry_selinux(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info *info)
{
int err;
- struct xt_secmark_target_selinux_info *sel = &info->u.sel;
- sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0';
+ info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+ info->secid = 0;
- err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+ err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+ &info->secid);
if (err) {
if (err == -EINVAL)
- pr_info("invalid SELinux context \'%s\'\n",
- sel->selctx);
+ pr_info("invalid security context \'%s\'\n", info->secctx);
return err;
}
- if (!sel->selsid) {
- pr_info("unable to map SELinux context \'%s\'\n", sel->selctx);
+ if (!info->secid) {
+ pr_info("unable to map security context \'%s\'\n", info->secctx);
return -ENOENT;
}
- err = selinux_secmark_relabel_packet_permission(sel->selsid);
+ err = security_secmark_relabel_packet(info->secid);
if (err) {
pr_info("unable to obtain relabeling permission\n");
return err;
}
- selinux_secmark_refcount_inc();
+ security_secmark_refcount_inc();
return 0;
}
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
switch (info->mode) {
case SECMARK_MODE_SEL:
- err = checkentry_selinux(info);
- if (err <= 0)
- return err;
break;
-
default:
pr_info("invalid mode: %hu\n", info->mode);
return -EINVAL;
}
+ err = checkentry_lsm(info);
+ if (err)
+ return err;
+
if (!mode)
mode = info->mode;
return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
{
switch (mode) {
case SECMARK_MODE_SEL:
- selinux_secmark_refcount_dec();
+ security_secmark_refcount_dec();
}
}
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index c61294d85fd..19c482caf30 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
/*
* Transparent proxy support for Linux/iptables
*
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
* Author: Balazs Scheidler, Krisztian Kovacs
*
* This program is free software; you can redistribute it and/or modify
@@ -16,19 +16,96 @@
#include <net/checksum.h>
#include <net/udp.h>
#include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
#include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+ struct in_device *indev;
+ __be32 laddr;
+
+ if (user_laddr)
+ return user_laddr;
+
+ laddr = 0;
+ rcu_read_lock();
+ indev = __in_dev_get_rcu(skb->dev);
+ for_primary_ifa(indev) {
+ laddr = ifa->ifa_local;
+ break;
+ } endfor_ifa(indev);
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @laddr: IPv4 address to redirect to or zero.
+ * @lport: TCP port to redirect to or zero.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ struct sock *sk)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ iph->saddr, laddr ? laddr : iph->daddr,
+ hp->source, lport ? lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+ u_int32_t mark_mask, u_int32_t mark_value)
{
const struct iphdr *iph = ip_hdr(skb);
- const struct xt_tproxy_target_info *tgi = par->targinfo;
struct udphdr _hdr, *hp;
struct sock *sk;
@@ -36,12 +113,195 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (hp == NULL)
return NF_DROP;
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
- iph->saddr,
- tgi->laddr ? tgi->laddr : iph->daddr,
- hp->source,
- tgi->lport ? tgi->lport : hp->dest,
- par->in, true);
+ iph->saddr, iph->daddr,
+ hp->source, hp->dest,
+ skb->dev, NFT_LOOKUP_ESTABLISHED);
+
+ laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+ if (!lport)
+ lport = hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+ else if (!sk)
+ /* no, there's no established connection, check if
+ * there's a listener on the redirected addr/port */
+ sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+ iph->saddr, laddr,
+ hp->source, lport,
+ skb->dev, NFT_LOOKUP_LISTENER);
+
+ /* NOTE: assign_sock consumes our sk reference */
+ if (sk && nf_tproxy_assign_sock(skb, sk)) {
+ /* This should be in a separate target, but we don't do multiple
+ targets on the same rule yet */
+ skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+ pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->daddr, ntohs(hp->dest),
+ &laddr, ntohs(lport), skb->mark);
+ return NF_ACCEPT;
+ }
+
+ pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+ iph->protocol, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
+ return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+ const struct in6_addr *daddr)
+{
+ struct inet6_dev *indev;
+ struct inet6_ifaddr *ifa;
+ struct in6_addr *laddr;
+
+ if (!ipv6_addr_any(user_laddr))
+ return user_laddr;
+ laddr = NULL;
+
+ rcu_read_lock();
+ indev = __in6_dev_get(skb->dev);
+ if (indev)
+ list_for_each_entry(ifa, &indev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+ continue;
+
+ laddr = &ifa->addr;
+ break;
+ }
+ rcu_read_unlock();
+
+ return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb: The skb being processed.
+ * @tproto: Transport protocol.
+ * @thoff: Transport protocol header offset.
+ * @par: Iptables target parameters.
+ * @sk: The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+ const struct xt_action_param *par,
+ struct sock *sk)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct tcphdr _hdr, *hp;
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ inet_twsk_put(inet_twsk(sk));
+ return NULL;
+ }
+
+ if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+ /* SYN to a TIME_WAIT socket, we'd rather redirect it
+ * to a listener socket if there's one */
+ struct sock *sk2;
+
+ sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr,
+ tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+ hp->source,
+ tgi->lport ? tgi->lport : hp->dest,
+ skb->dev, NFT_LOOKUP_LISTENER);
+ if (sk2) {
+ inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+ inet_twsk_put(inet_twsk(sk));
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+ struct udphdr _hdr, *hp;
+ struct sock *sk;
+ const struct in6_addr *laddr;
+ __be16 lport;
+ int thoff;
+ int tproto;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ /* check if there's an ongoing connection on the packet
+ * addresses, this happens if the redirect already happened
+ * and the current packet belongs to an already established
+ * connection */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr, &iph->daddr,
+ hp->source, hp->dest,
+ par->in, NFT_LOOKUP_ESTABLISHED);
+
+ laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+ lport = tgi->lport ? tgi->lport : hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT)
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+ else if (!sk)
+ /* no there's no established connection, check if
+ * there's a listener on the redirected addr/port */
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ &iph->saddr, laddr,
+ hp->source, lport,
+ par->in, NFT_LOOKUP_LISTENER);
/* NOTE: assign_sock consumes our sk reference */
if (sk && nf_tproxy_assign_sock(skb, sk)) {
@@ -49,19 +309,34 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
- pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
- iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
- ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+ pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->source),
+ laddr, ntohs(lport), skb->mark);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
- iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
- ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+ pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+ tproto, &iph->saddr, ntohs(hp->source),
+ &iph->daddr, ntohs(hp->dest), skb->mark);
+
return NF_DROP;
}
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+ const struct ip6t_ip6 *i = par->entryinfo;
+
+ if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+ && !(i->flags & IP6T_INV_PROTO))
+ return 0;
+
+ pr_info("Can be used only in combination with "
+ "either -p tcp or -p udp\n");
+ return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
{
const struct ipt_ip *i = par->entryinfo;
@@ -74,31 +349,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
-static struct xt_target tproxy_tg_reg __read_mostly = {
- .name = "TPROXY",
- .family = AF_INET,
- .table = "mangle",
- .target = tproxy_tg,
- .targetsize = sizeof(struct xt_tproxy_target_info),
- .checkentry = tproxy_tg_check,
- .hooks = 1 << NF_INET_PRE_ROUTING,
- .me = THIS_MODULE,
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v0,
+ .revision = 0,
+ .targetsize = sizeof(struct xt_tproxy_target_info),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tproxy_tg4_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg4_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ {
+ .name = "TPROXY",
+ .family = NFPROTO_IPV6,
+ .table = "mangle",
+ .target = tproxy_tg6_v1,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_tproxy_target_info_v1),
+ .checkentry = tproxy_tg6_check,
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .me = THIS_MODULE,
+ },
+#endif
+
};
static int __init tproxy_tg_init(void)
{
nf_defrag_ipv4_enable();
- return xt_register_target(&tproxy_tg_reg);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ nf_defrag_ipv6_enable();
+#endif
+
+ return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
static void __exit tproxy_tg_exit(void)
{
- xt_unregister_target(&tproxy_tg_reg);
+ xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
}
module_init(tproxy_tg_init);
module_exit(tproxy_tg_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b46a8390896..9228ee0dc11 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
{
__be16 _ports[2], *ports;
u8 nexthdr;
+ int poff;
memset(dst, 0, sizeof(*dst));
@@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
}
- switch (nexthdr) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- ports = skb_header_pointer(skb, protoff, sizeof(_ports),
+ poff = proto_ports_offset(nexthdr);
+ if (poff >= 0) {
+ ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports),
&_ports);
- break;
- default:
+ } else {
_ports[0] = _ports[1] = 0;
ports = _ports;
- break;
}
if (!ports)
return -1;
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 7a4d66db95a..9127a3d8aa3 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -16,7 +16,6 @@
#include <linux/ip_vs.h>
#include <linux/types.h>
#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_ipvs.h>
#include <net/netfilter/nf_conntrack.h>
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 76aec6a4476..d2ff15a2412 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -567,6 +567,7 @@ static const struct file_operations recent_mt_fops = {
.write = recent_mt_proc_write,
.release = seq_release_private,
.owner = THIS_MODULE,
+ .llseek = seq_lseek,
};
static int __net_init recent_proc_net_init(struct net *net)
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 1ca89908cba..2dbd4c85773 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>
@@ -21,6 +22,7 @@
#include <net/inet_sock.h>
#include <net/netfilter/nf_tproxy_core.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <linux/netfilter/xt_socket.h>
@@ -30,7 +32,7 @@
#endif
static int
-extract_icmp_fields(const struct sk_buff *skb,
+extract_icmp4_fields(const struct sk_buff *skb,
u8 *protocol,
__be32 *raddr,
__be32 *laddr,
@@ -86,7 +88,6 @@ extract_icmp_fields(const struct sk_buff *skb,
return 0;
}
-
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
@@ -115,7 +116,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
dport = hp->dest;
} else if (iph->protocol == IPPROTO_ICMP) {
- if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+ if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
&sport, &dport))
return false;
} else {
@@ -142,7 +143,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
#endif
sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
- saddr, daddr, sport, dport, par->in, false);
+ saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
if (sk != NULL) {
bool wildcard;
bool transparent = true;
@@ -165,32 +166,157 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
sk = NULL;
}
- pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n",
- protocol, ntohl(saddr), ntohs(sport),
- ntohl(daddr), ntohs(dport),
- ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+ pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+ protocol, &saddr, ntohs(sport),
+ &daddr, ntohs(dport),
+ &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
return (sk != NULL);
}
static bool
-socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, NULL);
}
static bool
-socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
return socket_match(skb, par, par->matchinfo);
}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+ unsigned int outside_hdrlen,
+ u8 *protocol,
+ struct in6_addr **raddr,
+ struct in6_addr **laddr,
+ __be16 *rport,
+ __be16 *lport)
+{
+ struct ipv6hdr *inside_iph, _inside_iph;
+ struct icmp6hdr *icmph, _icmph;
+ __be16 *ports, _ports[2];
+ u8 inside_nexthdr;
+ int inside_hdrlen;
+
+ icmph = skb_header_pointer(skb, outside_hdrlen,
+ sizeof(_icmph), &_icmph);
+ if (icmph == NULL)
+ return 1;
+
+ if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+ return 1;
+
+ inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph);
+ if (inside_iph == NULL)
+ return 1;
+ inside_nexthdr = inside_iph->nexthdr;
+
+ inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+ if (inside_hdrlen < 0)
+ return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+ if (inside_nexthdr != IPPROTO_TCP &&
+ inside_nexthdr != IPPROTO_UDP)
+ return 1;
+
+ ports = skb_header_pointer(skb, inside_hdrlen,
+ sizeof(_ports), &_ports);
+ if (ports == NULL)
+ return 1;
+
+ /* the inside IP packet is the one quoted from our side, thus
+ * its saddr is the local address */
+ *protocol = inside_nexthdr;
+ *laddr = &inside_iph->saddr;
+ *lport = ports[0];
+ *raddr = &inside_iph->daddr;
+ *rport = ports[1];
+
+ return 0;
+}
+
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct udphdr _hdr, *hp = NULL;
+ struct sock *sk;
+ struct in6_addr *daddr, *saddr;
+ __be16 dport, sport;
+ int thoff;
+ u8 tproto;
+ const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+ tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+ if (tproto < 0) {
+ pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ return NF_DROP;
+ }
+
+ if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+ hp = skb_header_pointer(skb, thoff,
+ sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return false;
+
+ saddr = &iph->saddr;
+ sport = hp->source;
+ daddr = &iph->daddr;
+ dport = hp->dest;
+
+ } else if (tproto == IPPROTO_ICMPV6) {
+ if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+ &sport, &dport))
+ return false;
+ } else {
+ return false;
+ }
+
+ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+ saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+ if (sk != NULL) {
+ bool wildcard;
+ bool transparent = true;
+
+ /* Ignore sockets listening on INADDR_ANY */
+ wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+ ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+
+ /* Ignore non-transparent sockets,
+ if XT_SOCKET_TRANSPARENT is used */
+ if (info && info->flags & XT_SOCKET_TRANSPARENT)
+ transparent = ((sk->sk_state != TCP_TIME_WAIT &&
+ inet_sk(sk)->transparent) ||
+ (sk->sk_state == TCP_TIME_WAIT &&
+ inet_twsk(sk)->tw_transparent));
+
+ nf_tproxy_put_sock(sk);
+
+ if (wildcard || !transparent)
+ sk = NULL;
+ }
+
+ pr_debug("proto %hhu %pI6:%hu -> %pI6:%hu "
+ "(orig %pI6:%hu) sock %p\n",
+ tproto, saddr, ntohs(sport),
+ daddr, ntohs(dport),
+ &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+ return (sk != NULL);
+}
+#endif
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
.revision = 0,
.family = NFPROTO_IPV4,
- .match = socket_mt_v0,
+ .match = socket_mt4_v0,
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
@@ -199,17 +325,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.name = "socket",
.revision = 1,
.family = NFPROTO_IPV4,
- .match = socket_mt_v1,
+ .match = socket_mt4_v1,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ {
+ .name = "socket",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .match = socket_mt6_v1,
+ .matchsize = sizeof(struct xt_socket_mtinfo1),
+ .hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init socket_mt_init(void)
{
nf_defrag_ipv4_enable();
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ nf_defrag_ipv6_enable();
+#endif
+
return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
}
@@ -225,3 +367,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
MODULE_DESCRIPTION("x_tables socket match module");
MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2cbf380377d..cd96ed3ccee 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1406,7 +1406,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
struct netlink_sock *nlk = nlk_sk(sk);
int noblock = flags&MSG_DONTWAIT;
size_t copied;
- struct sk_buff *skb;
+ struct sk_buff *skb, *data_skb;
int err;
if (flags&MSG_OOB)
@@ -1418,59 +1418,35 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
if (skb == NULL)
goto out;
+ data_skb = skb;
+
#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
if (unlikely(skb_shinfo(skb)->frag_list)) {
- bool need_compat = !!(flags & MSG_CMSG_COMPAT);
-
/*
- * If this skb has a frag_list, then here that means that
- * we will have to use the frag_list skb for compat tasks
- * and the regular skb for non-compat tasks.
+ * If this skb has a frag_list, then here that means that we
+ * will have to use the frag_list skb's data for compat tasks
+ * and the regular skb's data for normal (non-compat) tasks.
*
- * The skb might (and likely will) be cloned, so we can't
- * just reset frag_list and go on with things -- we need to
- * keep that. For the compat case that's easy -- simply get
- * a reference to the compat skb and free the regular one
- * including the frag. For the non-compat case, we need to
- * avoid sending the frag to the user -- so assign NULL but
- * restore it below before freeing the skb.
+ * If we need to send the compat skb, assign it to the
+ * 'data_skb' variable so that it will be used below for data
+ * copying. We keep 'skb' for everything else, including
+ * freeing both later.
*/
- if (need_compat) {
- struct sk_buff *compskb = skb_shinfo(skb)->frag_list;
- skb_get(compskb);
- kfree_skb(skb);
- skb = compskb;
- } else {
- /*
- * Before setting frag_list to NULL, we must get a
- * private copy of skb if shared (because of MSG_PEEK)
- */
- if (skb_shared(skb)) {
- struct sk_buff *nskb;
-
- nskb = pskb_copy(skb, GFP_KERNEL);
- kfree_skb(skb);
- skb = nskb;
- err = -ENOMEM;
- if (!skb)
- goto out;
- }
- kfree_skb(skb_shinfo(skb)->frag_list);
- skb_shinfo(skb)->frag_list = NULL;
- }
+ if (flags & MSG_CMSG_COMPAT)
+ data_skb = skb_shinfo(skb)->frag_list;
}
#endif
msg->msg_namelen = 0;
- copied = skb->len;
+ copied = data_skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
- skb_reset_transport_header(skb);
- err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+ skb_reset_transport_header(data_skb);
+ err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
if (msg->msg_name) {
struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
@@ -1490,7 +1466,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
}
siocb->scm->creds = *NETLINK_CREDS(skb);
if (flags & MSG_TRUNC)
- copied = skb->len;
+ copied = data_skb->len;
skb_free_datagram(sk, skb);
@@ -2126,6 +2102,26 @@ static void __net_exit netlink_net_exit(struct net *net)
#endif
}
+static void __init netlink_add_usersock_entry(void)
+{
+ unsigned long *listeners;
+ int groups = 32;
+
+ listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head),
+ GFP_KERNEL);
+ if (!listeners)
+ panic("netlink_add_usersock_entry: Cannot allocate listneres\n");
+
+ netlink_table_grab();
+
+ nl_table[NETLINK_USERSOCK].groups = groups;
+ nl_table[NETLINK_USERSOCK].listeners = listeners;
+ nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
+ nl_table[NETLINK_USERSOCK].registered = 1;
+
+ netlink_table_ungrab();
+}
+
static struct pernet_operations __net_initdata netlink_net_ops = {
.init = netlink_net_init,
.exit = netlink_net_exit,
@@ -2174,6 +2170,8 @@ static int __init netlink_proto_init(void)
hash->rehash_time = jiffies;
}
+ netlink_add_usersock_entry();
+
sock_register(&netlink_family_ops);
register_pernet_subsys(&netlink_net_ops);
/* The netlink device handler may be needed early. */
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 26ed3e8587c..1781d99145e 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -547,8 +547,20 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
info.attrs = family->attrbuf;
genl_info_net_set(&info, net);
+ memset(&info.user_ptr, 0, sizeof(info.user_ptr));
- return ops->doit(skb, &info);
+ if (family->pre_doit) {
+ err = family->pre_doit(ops, skb, &info);
+ if (err)
+ return err;
+ }
+
+ err = ops->doit(skb, &info);
+
+ if (family->post_doit)
+ family->post_doit(ops, skb, &info);
+
+ return err;
}
static void genl_rcv(struct sk_buff *skb)
diff --git a/net/nonet.c b/net/nonet.c
index 92e76640c7c..b1a73fda9c1 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -22,4 +22,5 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
const struct file_operations bad_sock_fops = {
.owner = THIS_MODULE,
.open = sock_no_open,
+ .llseek = noop_llseek,
};
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9a17f28b125..3616f27b9d4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -488,7 +488,7 @@ retry:
skb->dev = dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- err = sock_tx_timestamp(msg, sk, skb_tx(skb));
+ err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
if (err < 0)
goto out_unlock;
@@ -1209,7 +1209,7 @@ static int packet_snd(struct socket *sock,
err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
if (err)
goto out_free;
- err = sock_tx_timestamp(msg, sk, skb_tx(skb));
+ err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
if (err < 0)
goto out_free;
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
index 6ec7d55b176..0d9b8a220a7 100644
--- a/net/phonet/Kconfig
+++ b/net/phonet/Kconfig
@@ -14,3 +14,15 @@ config PHONET
To compile this driver as a module, choose M here: the module
will be called phonet. If unsure, say N.
+
+config PHONET_PIPECTRLR
+ bool "Phonet Pipe Controller (EXPERIMENTAL)"
+ depends on PHONET && EXPERIMENTAL
+ default N
+ help
+ The Pipe Controller implementation in Phonet stack to support Pipe
+ data with Nokia Slim modems like WG2.5 used on ST-Ericsson U8500
+ platform.
+
+ This option is incompatible with older Nokia modems.
+ Say N here unless you really know what you are doing.
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 73aee7f2fcd..fd95beb72f5 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -251,6 +251,16 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb,
else if (phonet_address_lookup(net, daddr) == 0) {
dev = phonet_device_get(net);
skb->pkt_type = PACKET_LOOPBACK;
+ } else if (pn_sockaddr_get_object(target) == 0) {
+ /* Resource routing (small race until phonet_rcv()) */
+ struct sock *sk = pn_find_sock_by_res(net,
+ target->spn_resource);
+ if (sk) {
+ sock_put(sk);
+ dev = phonet_device_get(net);
+ skb->pkt_type = PACKET_LOOPBACK;
+ } else
+ dev = phonet_route_output(net, daddr);
} else
dev = phonet_route_output(net, daddr);
@@ -383,6 +393,13 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
goto out;
}
+ /* resource routing */
+ if (pn_sockaddr_get_object(&sa) == 0) {
+ struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource);
+ if (sk)
+ return sk_receive_skb(sk, skb, 0);
+ }
+
/* check if we are the destination */
if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) {
/* Phonet packet input */
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 1bd38db4fe1..2f032381bd4 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -52,6 +52,19 @@ static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg)
answ = skb ? skb->len : 0;
release_sock(sk);
return put_user(answ, (int __user *)arg);
+
+ case SIOCPNADDRESOURCE:
+ case SIOCPNDELRESOURCE: {
+ u32 res;
+ if (get_user(res, (u32 __user *)arg))
+ return -EFAULT;
+ if (res >= 256)
+ return -EINVAL;
+ if (cmd == SIOCPNADDRESOURCE)
+ return pn_sock_bind_res(sk, res);
+ else
+ return pn_sock_unbind_res(sk, res);
+ }
}
return -ENOIOCTLCMD;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index b2a3ae6cad7..3e60f2e4e6c 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -109,6 +109,210 @@ static int pep_reply(struct sock *sk, struct sk_buff *oskb,
}
#define PAD 0x00
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+static u8 pipe_negotiate_fc(u8 *host_fc, u8 *remote_fc, int len)
+{
+ int i, j;
+ u8 base_fc, final_fc;
+
+ for (i = 0; i < len; i++) {
+ base_fc = host_fc[i];
+ for (j = 0; j < len; j++) {
+ if (remote_fc[j] == base_fc) {
+ final_fc = base_fc;
+ goto done;
+ }
+ }
+ }
+ return -EINVAL;
+
+done:
+ return final_fc;
+
+}
+
+static int pipe_get_flow_info(struct sock *sk, struct sk_buff *skb,
+ u8 *pref_rx_fc, u8 *req_tx_fc)
+{
+ struct pnpipehdr *hdr;
+ u8 n_sb;
+
+ if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+ return -EINVAL;
+
+ hdr = pnp_hdr(skb);
+ n_sb = hdr->data[4];
+
+ __skb_pull(skb, sizeof(*hdr) + 4);
+ while (n_sb > 0) {
+ u8 type, buf[3], len = sizeof(buf);
+ u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+ if (data == NULL)
+ return -EINVAL;
+
+ switch (type) {
+ case PN_PIPE_SB_REQUIRED_FC_TX:
+ if (len < 3 || (data[2] | data[3] | data[4]) > 3)
+ break;
+ req_tx_fc[0] = data[2];
+ req_tx_fc[1] = data[3];
+ req_tx_fc[2] = data[4];
+ break;
+
+ case PN_PIPE_SB_PREFERRED_FC_RX:
+ if (len < 3 || (data[2] | data[3] | data[4]) > 3)
+ break;
+ pref_rx_fc[0] = data[2];
+ pref_rx_fc[1] = data[3];
+ pref_rx_fc[2] = data[4];
+ break;
+
+ }
+ n_sb--;
+ }
+ return 0;
+}
+
+static int pipe_handler_send_req(struct sock *sk, u8 utid,
+ u8 msg_id, gfp_t priority)
+{
+ int len;
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+ struct pep_sock *pn = pep_sk(sk);
+
+ static const u8 data[4] = {
+ PAD, PAD, PAD, PAD,
+ };
+
+ switch (msg_id) {
+ case PNS_PEP_CONNECT_REQ:
+ len = sizeof(data);
+ break;
+
+ case PNS_PEP_DISCONNECT_REQ:
+ case PNS_PEP_ENABLE_REQ:
+ case PNS_PEP_DISABLE_REQ:
+ len = 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
+ if (!skb)
+ return -ENOMEM;
+ skb_set_owner_w(skb, sk);
+
+ skb_reserve(skb, MAX_PNPIPE_HEADER);
+ if (len) {
+ __skb_put(skb, len);
+ skb_copy_to_linear_data(skb, data, len);
+ }
+ __skb_push(skb, sizeof(*ph));
+ skb_reset_transport_header(skb);
+ ph = pnp_hdr(skb);
+ ph->utid = utid;
+ ph->message_id = msg_id;
+ ph->pipe_handle = pn->pipe_handle;
+ ph->error_code = PN_PIPE_NO_ERROR;
+
+ return pn_skb_send(sk, skb, &pn->remote_pep);
+}
+
+static int pipe_handler_send_created_ind(struct sock *sk,
+ u8 utid, u8 msg_id)
+{
+ int err_code;
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+
+ struct pep_sock *pn = pep_sk(sk);
+ static u8 data[4] = {
+ 0x03, 0x04,
+ };
+ data[2] = pn->tx_fc;
+ data[3] = pn->rx_fc;
+
+ /*
+ * actually, below is number of sub-blocks and not error code.
+ * Pipe_created_ind message format does not have any
+ * error code field. However, the Phonet stack will always send
+ * an error code as part of pnpipehdr. So, use that err_code to
+ * specify the number of sub-blocks.
+ */
+ err_code = 0x01;
+
+ skb = alloc_skb(MAX_PNPIPE_HEADER + sizeof(data), GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+ skb_set_owner_w(skb, sk);
+
+ skb_reserve(skb, MAX_PNPIPE_HEADER);
+ __skb_put(skb, sizeof(data));
+ skb_copy_to_linear_data(skb, data, sizeof(data));
+ __skb_push(skb, sizeof(*ph));
+ skb_reset_transport_header(skb);
+ ph = pnp_hdr(skb);
+ ph->utid = utid;
+ ph->message_id = msg_id;
+ ph->pipe_handle = pn->pipe_handle;
+ ph->error_code = err_code;
+
+ return pn_skb_send(sk, skb, &pn->remote_pep);
+}
+
+static int pipe_handler_send_ind(struct sock *sk, u8 utid, u8 msg_id)
+{
+ int err_code;
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+ struct pep_sock *pn = pep_sk(sk);
+
+ /*
+ * actually, below is a filler.
+ * Pipe_enabled/disabled_ind message format does not have any
+ * error code field. However, the Phonet stack will always send
+ * an error code as part of pnpipehdr. So, use that err_code to
+ * specify the filler value.
+ */
+ err_code = 0x0;
+
+ skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+ skb_set_owner_w(skb, sk);
+
+ skb_reserve(skb, MAX_PNPIPE_HEADER);
+ __skb_push(skb, sizeof(*ph));
+ skb_reset_transport_header(skb);
+ ph = pnp_hdr(skb);
+ ph->utid = utid;
+ ph->message_id = msg_id;
+ ph->pipe_handle = pn->pipe_handle;
+ ph->error_code = err_code;
+
+ return pn_skb_send(sk, skb, &pn->remote_pep);
+}
+
+static int pipe_handler_enable_pipe(struct sock *sk, int enable)
+{
+ int utid, req;
+
+ if (enable) {
+ utid = PNS_PIPE_ENABLE_UTID;
+ req = PNS_PEP_ENABLE_REQ;
+ } else {
+ utid = PNS_PIPE_DISABLE_UTID;
+ req = PNS_PEP_DISABLE_REQ;
+ }
+ return pipe_handler_send_req(sk, utid, req, GFP_ATOMIC);
+}
+#endif
+
static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
{
static const u8 data[20] = {
@@ -192,7 +396,11 @@ static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
ph->data[3] = PAD;
ph->data[4] = status;
+#ifdef CONFIG_PHONET_PIPECTRLR
+ return pn_skb_send(sk, skb, &pn->remote_pep);
+#else
return pn_skb_send(sk, skb, &pipe_srv);
+#endif
}
/* Send our RX flow control information to the sender.
@@ -225,12 +433,13 @@ static void pipe_grant_credits(struct sock *sk)
static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
{
struct pep_sock *pn = pep_sk(sk);
- struct pnpipehdr *hdr = pnp_hdr(skb);
+ struct pnpipehdr *hdr;
int wake = 0;
if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
return -EINVAL;
+ hdr = pnp_hdr(skb);
if (hdr->data[0] != PN_PEP_TYPE_COMMON) {
LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n",
(unsigned)hdr->data[0]);
@@ -323,11 +532,35 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_state_change(sk);
break;
+#ifdef CONFIG_PHONET_PIPECTRLR
+ case PNS_PEP_DISCONNECT_RESP:
+ pn->pipe_state = PIPE_IDLE;
+ sk->sk_state = TCP_CLOSE;
+ break;
+#endif
+
case PNS_PEP_ENABLE_REQ:
/* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
break;
+#ifdef CONFIG_PHONET_PIPECTRLR
+ case PNS_PEP_ENABLE_RESP:
+ pn->pipe_state = PIPE_ENABLED;
+ pipe_handler_send_ind(sk, PNS_PIPE_ENABLED_IND_UTID,
+ PNS_PIPE_ENABLED_IND);
+
+ if (!pn_flow_safe(pn->tx_fc)) {
+ atomic_set(&pn->tx_credits, 1);
+ sk->sk_write_space(sk);
+ }
+ if (sk->sk_state == TCP_ESTABLISHED)
+ break; /* Nothing to do */
+ sk->sk_state = TCP_ESTABLISHED;
+ pipe_grant_credits(sk);
+ break;
+#endif
+
case PNS_PEP_RESET_REQ:
switch (hdr->state_after_reset) {
case PN_PIPE_DISABLE:
@@ -346,6 +579,17 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
break;
+#ifdef CONFIG_PHONET_PIPECTRLR
+ case PNS_PEP_DISABLE_RESP:
+ pn->pipe_state = PIPE_DISABLED;
+ atomic_set(&pn->tx_credits, 0);
+ pipe_handler_send_ind(sk, PNS_PIPE_DISABLED_IND_UTID,
+ PNS_PIPE_DISABLED_IND);
+ sk->sk_state = TCP_SYN_RECV;
+ pn->rx_credits = 0;
+ break;
+#endif
+
case PNS_PEP_CTRL_REQ:
if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
atomic_inc(&sk->sk_drops);
@@ -437,6 +681,42 @@ static void pipe_destruct(struct sock *sk)
skb_queue_purge(&pn->ctrlreq_queue);
}
+#ifdef CONFIG_PHONET_PIPECTRLR
+static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ u8 host_pref_rx_fc[3] = {3, 2, 1}, host_req_tx_fc[3] = {3, 2, 1};
+ u8 remote_pref_rx_fc[3], remote_req_tx_fc[3];
+ u8 negotiated_rx_fc, negotiated_tx_fc;
+ int ret;
+
+ pipe_get_flow_info(sk, skb, remote_pref_rx_fc,
+ remote_req_tx_fc);
+ negotiated_tx_fc = pipe_negotiate_fc(remote_req_tx_fc,
+ host_pref_rx_fc,
+ sizeof(host_pref_rx_fc));
+ negotiated_rx_fc = pipe_negotiate_fc(host_req_tx_fc,
+ remote_pref_rx_fc,
+ sizeof(host_pref_rx_fc));
+
+ pn->pipe_state = PIPE_DISABLED;
+ sk->sk_state = TCP_SYN_RECV;
+ sk->sk_backlog_rcv = pipe_do_rcv;
+ sk->sk_destruct = pipe_destruct;
+ pn->rx_credits = 0;
+ pn->rx_fc = negotiated_rx_fc;
+ pn->tx_fc = negotiated_tx_fc;
+ sk->sk_state_change(sk);
+
+ ret = pipe_handler_send_created_ind(sk,
+ PNS_PIPE_CREATED_IND_UTID,
+ PNS_PIPE_CREATED_IND
+ );
+
+ return ret;
+}
+#endif
+
static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *newsk;
@@ -600,6 +880,12 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
err = pep_connreq_rcv(sk, skb);
break;
+#ifdef CONFIG_PHONET_PIPECTRLR
+ case PNS_PEP_CONNECT_RESP:
+ err = pep_connresp_rcv(sk, skb);
+ break;
+#endif
+
case PNS_PEP_DISCONNECT_REQ:
pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
break;
@@ -620,6 +906,28 @@ drop:
return err;
}
+static int pipe_do_remove(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct pnpipehdr *ph;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_reserve(skb, MAX_PNPIPE_HEADER);
+ __skb_push(skb, sizeof(*ph));
+ skb_reset_transport_header(skb);
+ ph = pnp_hdr(skb);
+ ph->utid = 0;
+ ph->message_id = PNS_PIPE_REMOVE_REQ;
+ ph->pipe_handle = pn->pipe_handle;
+ ph->data[0] = PAD;
+
+ return pn_skb_send(sk, skb, &pipe_srv);
+}
+
/* associated socket ceases to exist */
static void pep_sock_close(struct sock *sk, long timeout)
{
@@ -638,7 +946,22 @@ static void pep_sock_close(struct sock *sk, long timeout)
sk_for_each_safe(sknode, p, n, &pn->ackq)
sk_del_node_init(sknode);
sk->sk_state = TCP_CLOSE;
+ } else if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
+ /* Forcefully remove dangling Phonet pipe */
+ pipe_do_remove(sk);
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+ if (pn->pipe_state != PIPE_IDLE) {
+ /* send pep disconnect request */
+ pipe_handler_send_req(sk,
+ PNS_PEP_DISCONNECT_UTID, PNS_PEP_DISCONNECT_REQ,
+ GFP_KERNEL);
+
+ pn->pipe_state = PIPE_IDLE;
+ sk->sk_state = TCP_CLOSE;
}
+#endif
+
ifindex = pn->ifindex;
pn->ifindex = 0;
release_sock(sk);
@@ -715,6 +1038,20 @@ out:
return newsk;
}
+#ifdef CONFIG_PHONET_PIPECTRLR
+static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+
+ memcpy(&pn->remote_pep, spn, sizeof(struct sockaddr_pn));
+
+ return pipe_handler_send_req(sk,
+ PNS_PEP_CONNECT_UTID, PNS_PEP_CONNECT_REQ,
+ GFP_ATOMIC);
+}
+#endif
+
static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
struct pep_sock *pn = pep_sk(sk);
@@ -766,6 +1103,18 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
lock_sock(sk);
switch (optname) {
+#ifdef CONFIG_PHONET_PIPECTRLR
+ case PNPIPE_PIPE_HANDLE:
+ if (val) {
+ if (pn->pipe_state > PIPE_IDLE) {
+ err = -EFAULT;
+ break;
+ }
+ pn->pipe_handle = val;
+ break;
+ }
+#endif
+
case PNPIPE_ENCAP:
if (val && val != PNPIPE_ENCAP_IP) {
err = -EINVAL;
@@ -791,6 +1140,17 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
err = 0;
}
goto out_norel;
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+ case PNPIPE_ENABLE:
+ if (pn->pipe_state <= PIPE_IDLE) {
+ err = -ENOTCONN;
+ break;
+ }
+ err = pipe_handler_enable_pipe(sk, val);
+ break;
+#endif
+
default:
err = -ENOPROTOOPT;
}
@@ -815,9 +1175,19 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
case PNPIPE_ENCAP:
val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE;
break;
+
case PNPIPE_IFINDEX:
val = pn->ifindex;
break;
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+ case PNPIPE_ENABLE:
+ if (pn->pipe_state <= PIPE_IDLE)
+ return -ENOTCONN;
+ val = pn->pipe_state != PIPE_DISABLED;
+ break;
+#endif
+
default:
return -ENOPROTOOPT;
}
@@ -834,6 +1204,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
{
struct pep_sock *pn = pep_sk(sk);
struct pnpipehdr *ph;
+ int err;
if (pn_flow_safe(pn->tx_fc) &&
!atomic_add_unless(&pn->tx_credits, -1, 0)) {
@@ -851,8 +1222,16 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
} else
ph->message_id = PNS_PIPE_DATA;
ph->pipe_handle = pn->pipe_handle;
+#ifdef CONFIG_PHONET_PIPECTRLR
+ err = pn_skb_send(sk, skb, &pn->remote_pep);
+#else
+ err = pn_skb_send(sk, skb, &pipe_srv);
+#endif
+
+ if (err && pn_flow_safe(pn->tx_fc))
+ atomic_inc(&pn->tx_credits);
+ return err;
- return pn_skb_send(sk, skb, &pipe_srv);
}
static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
@@ -872,7 +1251,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len,
flags & MSG_DONTWAIT, &err);
if (!skb)
- return -ENOBUFS;
+ return err;
skb_reserve(skb, MAX_PHONET_HEADER + 3);
err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
@@ -1044,6 +1423,8 @@ static void pep_sock_unhash(struct sock *sk)
struct sock *skparent = NULL;
lock_sock(sk);
+
+#ifndef CONFIG_PHONET_PIPECTRLR
if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN)) {
skparent = pn->listener;
release_sock(sk);
@@ -1053,6 +1434,7 @@ static void pep_sock_unhash(struct sock *sk)
sk_del_node_init(sk);
sk = skparent;
}
+#endif
/* Unhash a listening sock only when it is closed
* and all of its active connected pipes are closed. */
if (hlist_empty(&pn->hlist))
@@ -1066,6 +1448,9 @@ static void pep_sock_unhash(struct sock *sk)
static struct proto pep_proto = {
.close = pep_sock_close,
.accept = pep_sock_accept,
+#ifdef CONFIG_PHONET_PIPECTRLR
+ .connect = pep_sock_connect,
+#endif
.ioctl = pep_ioctl,
.init = pep_init,
.setsockopt = pep_setsockopt,
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index b18e48fae97..947038ddd04 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -292,8 +292,7 @@ static void phonet_route_autodel(struct net_device *dev)
if (bitmap_empty(deleted, 64))
return; /* short-circuit RCU */
synchronize_rcu();
- for (i = find_first_bit(deleted, 64); i < 64;
- i = find_next_bit(deleted, 64, i + 1)) {
+ for_each_set_bit(i, deleted, 64) {
rtm_phonet_notify(RTM_DELROUTE, dev, i);
dev_put(dev);
}
@@ -374,6 +373,7 @@ int __init phonet_device_init(void)
if (err)
return err;
+ proc_net_fops_create(&init_net, "pnresource", 0, &pn_res_seq_fops);
register_netdevice_notifier(&phonet_device_notifier);
err = phonet_netlink_register();
if (err)
@@ -386,6 +386,7 @@ void phonet_device_exit(void)
rtnl_unregister_all(PF_PHONET);
unregister_netdevice_notifier(&phonet_device_notifier);
unregister_pernet_device(&phonet_net_ops);
+ proc_net_remove(&init_net, "pnresource");
}
int phonet_route_add(struct net_device *dev, u8 daddr)
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 6e9848bf037..25f746d20c1 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -158,6 +158,7 @@ void pn_sock_unhash(struct sock *sk)
spin_lock_bh(&pnsocks.lock);
sk_del_node_init(sk);
spin_unlock_bh(&pnsocks.lock);
+ pn_sock_unbind_all_res(sk);
}
EXPORT_SYMBOL(pn_sock_unhash);
@@ -224,6 +225,101 @@ static int pn_socket_autobind(struct socket *sock)
return 0; /* socket was already bound */
}
+#ifdef CONFIG_PHONET_PIPECTRLR
+static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
+ int len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+ long timeo;
+ int err;
+
+ if (len < sizeof(struct sockaddr_pn))
+ return -EINVAL;
+ if (spn->spn_family != AF_PHONET)
+ return -EAFNOSUPPORT;
+
+ lock_sock(sk);
+
+ switch (sock->state) {
+ case SS_UNCONNECTED:
+ sk->sk_state = TCP_CLOSE;
+ break;
+ case SS_CONNECTING:
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ sock->state = SS_CONNECTED;
+ err = -EISCONN;
+ goto out;
+ case TCP_CLOSE:
+ err = -EALREADY;
+ if (flags & O_NONBLOCK)
+ goto out;
+ goto wait_connect;
+ }
+ break;
+ case SS_CONNECTED:
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ err = -EISCONN;
+ goto out;
+ case TCP_CLOSE:
+ sock->state = SS_UNCONNECTED;
+ break;
+ }
+ break;
+ case SS_DISCONNECTING:
+ case SS_FREE:
+ break;
+ }
+ sk->sk_state = TCP_CLOSE;
+ sk_stream_kill_queues(sk);
+
+ sock->state = SS_CONNECTING;
+ err = sk->sk_prot->connect(sk, addr, len);
+ if (err < 0) {
+ sock->state = SS_UNCONNECTED;
+ sk->sk_state = TCP_CLOSE;
+ goto out;
+ }
+
+ err = -EINPROGRESS;
+wait_connect:
+ if (sk->sk_state != TCP_SYN_RECV && (flags & O_NONBLOCK))
+ goto out;
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+ release_sock(sk);
+
+ err = -ERESTARTSYS;
+ timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ sk->sk_state != TCP_CLOSE,
+ timeo);
+
+ lock_sock(sk);
+ if (timeo < 0)
+ goto out; /* -ERESTARTSYS */
+
+ err = -ETIMEDOUT;
+ if (timeo == 0 && sk->sk_state != TCP_SYN_RECV)
+ goto out;
+
+ if (sk->sk_state != TCP_SYN_RECV) {
+ sock->state = SS_UNCONNECTED;
+ err = sock_error(sk);
+ if (!err)
+ err = -ECONNREFUSED;
+ goto out;
+ }
+ sock->state = SS_CONNECTED;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+#endif
+
static int pn_socket_accept(struct socket *sock, struct socket *newsock,
int flags)
{
@@ -281,7 +377,9 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
return POLLHUP;
- if (sk->sk_state == TCP_ESTABLISHED && atomic_read(&pn->tx_credits))
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf &&
+ atomic_read(&pn->tx_credits))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
return mask;
@@ -390,7 +488,11 @@ const struct proto_ops phonet_stream_ops = {
.owner = THIS_MODULE,
.release = pn_socket_release,
.bind = pn_socket_bind,
+#ifdef CONFIG_PHONET_PIPECTRLR
+ .connect = pn_socket_connect,
+#else
.connect = sock_no_connect,
+#endif
.socketpair = sock_no_socketpair,
.accept = pn_socket_accept,
.getname = pn_socket_getname,
@@ -563,3 +665,188 @@ const struct file_operations pn_sock_seq_fops = {
.release = seq_release_net,
};
#endif
+
+static struct {
+ struct sock *sk[256];
+} pnres;
+
+/*
+ * Find and hold socket based on resource.
+ */
+struct sock *pn_find_sock_by_res(struct net *net, u8 res)
+{
+ struct sock *sk;
+
+ if (!net_eq(net, &init_net))
+ return NULL;
+
+ rcu_read_lock();
+ sk = rcu_dereference(pnres.sk[res]);
+ if (sk)
+ sock_hold(sk);
+ rcu_read_unlock();
+ return sk;
+}
+
+static DEFINE_MUTEX(resource_mutex);
+
+int pn_sock_bind_res(struct sock *sk, u8 res)
+{
+ int ret = -EADDRINUSE;
+
+ if (!net_eq(sock_net(sk), &init_net))
+ return -ENOIOCTLCMD;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (pn_socket_autobind(sk->sk_socket))
+ return -EAGAIN;
+
+ mutex_lock(&resource_mutex);
+ if (pnres.sk[res] == NULL) {
+ sock_hold(sk);
+ rcu_assign_pointer(pnres.sk[res], sk);
+ ret = 0;
+ }
+ mutex_unlock(&resource_mutex);
+ return ret;
+}
+
+int pn_sock_unbind_res(struct sock *sk, u8 res)
+{
+ int ret = -ENOENT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&resource_mutex);
+ if (pnres.sk[res] == sk) {
+ rcu_assign_pointer(pnres.sk[res], NULL);
+ ret = 0;
+ }
+ mutex_unlock(&resource_mutex);
+
+ if (ret == 0) {
+ synchronize_rcu();
+ sock_put(sk);
+ }
+ return ret;
+}
+
+void pn_sock_unbind_all_res(struct sock *sk)
+{
+ unsigned res, match = 0;
+
+ mutex_lock(&resource_mutex);
+ for (res = 0; res < 256; res++) {
+ if (pnres.sk[res] == sk) {
+ rcu_assign_pointer(pnres.sk[res], NULL);
+ match++;
+ }
+ }
+ mutex_unlock(&resource_mutex);
+
+ if (match == 0)
+ return;
+ synchronize_rcu();
+ while (match > 0) {
+ sock_put(sk);
+ match--;
+ }
+}
+
+#ifdef CONFIG_PROC_FS
+static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct net *net = seq_file_net(seq);
+ unsigned i;
+
+ if (!net_eq(net, &init_net))
+ return NULL;
+
+ for (i = 0; i < 256; i++) {
+ if (pnres.sk[i] == NULL)
+ continue;
+ if (!pos)
+ return pnres.sk + i;
+ pos--;
+ }
+ return NULL;
+}
+
+static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
+{
+ struct net *net = seq_file_net(seq);
+ unsigned i;
+
+ BUG_ON(!net_eq(net, &init_net));
+
+ for (i = (sk - pnres.sk) + 1; i < 256; i++)
+ if (pnres.sk[i])
+ return pnres.sk + i;
+ return NULL;
+}
+
+static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(resource_mutex)
+{
+ mutex_lock(&resource_mutex);
+ return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sock **sk;
+
+ if (v == SEQ_START_TOKEN)
+ sk = pn_res_get_idx(seq, 0);
+ else
+ sk = pn_res_get_next(seq, v);
+ (*pos)++;
+ return sk;
+}
+
+static void pn_res_seq_stop(struct seq_file *seq, void *v)
+ __releases(resource_mutex)
+{
+ mutex_unlock(&resource_mutex);
+}
+
+static int pn_res_seq_show(struct seq_file *seq, void *v)
+{
+ int len;
+
+ if (v == SEQ_START_TOKEN)
+ seq_printf(seq, "%s%n", "rs uid inode", &len);
+ else {
+ struct sock **psk = v;
+ struct sock *sk = *psk;
+
+ seq_printf(seq, "%02X %5d %lu%n",
+ (int) (psk - pnres.sk), sock_i_uid(sk),
+ sock_i_ino(sk), &len);
+ }
+ seq_printf(seq, "%*s\n", 63 - len, "");
+ return 0;
+}
+
+static const struct seq_operations pn_res_seq_ops = {
+ .start = pn_res_seq_start,
+ .next = pn_res_seq_next,
+ .stop = pn_res_seq_stop,
+ .show = pn_res_seq_show,
+};
+
+static int pn_res_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &pn_res_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+const struct file_operations pn_res_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = pn_res_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index aebfecbdb84..bb6ad81b671 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -39,7 +39,15 @@
#include <net/sock.h>
#include "rds.h"
-#include "rdma.h"
+
+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+ if ((index < elements) && array[index])
+ return array[index];
+ else
+ return "unknown";
+}
+EXPORT_SYMBOL(rds_str_array);
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock)
struct rds_sock *rs;
unsigned long flags;
- if (sk == NULL)
+ if (!sk)
goto out;
rs = rds_sk_to_rs(sk);
@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock)
* with the socket. */
rds_clear_recv_queue(rs);
rds_cong_remove_socket(rs);
+
+ /*
+ * the binding lookup hash uses rcu, we need to
+ * make sure we sychronize_rcu before we free our
+ * entry
+ */
rds_remove_bound(rs);
+ synchronize_rcu();
+
rds_send_drop_to(rs, NULL);
rds_rdma_drop_keys(rs);
rds_notify_queue_get(rs, NULL);
@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock)
rds_sock_count--;
spin_unlock_irqrestore(&rds_sock_lock, flags);
+ rds_trans_put(rs->rs_transport);
+
sock->sk = NULL;
sock_put(sk);
out:
@@ -514,7 +532,7 @@ out:
spin_unlock_irqrestore(&rds_sock_lock, flags);
}
-static void __exit rds_exit(void)
+static void rds_exit(void)
{
sock_unregister(rds_family_ops.family);
proto_unregister(&rds_proto);
@@ -529,7 +547,7 @@ static void __exit rds_exit(void)
}
module_exit(rds_exit);
-static int __init rds_init(void)
+static int rds_init(void)
{
int ret;
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5d95fc007f1..2f6b3fcc79f 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -34,45 +34,52 @@
#include <net/sock.h>
#include <linux/in.h>
#include <linux/if_arp.h>
+#include <linux/jhash.h>
#include "rds.h"
-/*
- * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
- * particularly zippy.
- *
- * This is now called for every incoming frame so we arguably care much more
- * about it than we used to.
- */
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
static DEFINE_SPINLOCK(rds_bind_lock);
-static struct rb_root rds_bind_tree = RB_ROOT;
-static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
- struct rds_sock *insert)
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+ return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+ (BIND_HASH_SIZE - 1));
+}
+
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+ struct rds_sock *insert)
{
- struct rb_node **p = &rds_bind_tree.rb_node;
- struct rb_node *parent = NULL;
struct rds_sock *rs;
+ struct hlist_node *node;
+ struct hlist_head *head = hash_to_bucket(addr, port);
u64 cmp;
u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
- while (*p) {
- parent = *p;
- rs = rb_entry(parent, struct rds_sock, rs_bound_node);
-
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
be16_to_cpu(rs->rs_bound_port);
- if (needle < cmp)
- p = &(*p)->rb_left;
- else if (needle > cmp)
- p = &(*p)->rb_right;
- else
+ if (cmp == needle) {
+ rcu_read_unlock();
return rs;
+ }
}
+ rcu_read_unlock();
if (insert) {
- rb_link_node(&insert->rs_bound_node, parent, p);
- rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
+ /*
+ * make sure our addr and port are set before
+ * we are added to the list, other people
+ * in rcu will find us as soon as the
+ * hlist_add_head_rcu is done
+ */
+ insert->rs_bound_addr = addr;
+ insert->rs_bound_port = port;
+ rds_sock_addref(insert);
+
+ hlist_add_head_rcu(&insert->rs_bound_node, head);
}
return NULL;
}
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
struct rds_sock *rs;
- unsigned long flags;
- spin_lock_irqsave(&rds_bind_lock, flags);
- rs = rds_bind_tree_walk(addr, port, NULL);
+ rs = rds_bind_lookup(addr, port, NULL);
+
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
- spin_unlock_irqrestore(&rds_bind_lock, flags);
rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
ntohs(port));
@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
do {
if (rover == 0)
rover++;
- if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
- *port = cpu_to_be16(rover);
+ if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+ *port = rs->rs_bound_port;
ret = 0;
+ rdsdebug("rs %p binding to %pI4:%d\n",
+ rs, &addr, (int)ntohs(*port));
break;
}
} while (rover++ != last);
- if (ret == 0) {
- rs->rs_bound_addr = addr;
- rs->rs_bound_port = *port;
- rds_sock_addref(rs);
-
- rdsdebug("rs %p binding to %pI4:%d\n",
- rs, &addr, (int)ntohs(*port));
- }
-
spin_unlock_irqrestore(&rds_bind_lock, flags);
return ret;
@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs)
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));
- rb_erase(&rs->rs_bound_node, &rds_bind_tree);
+ hlist_del_init_rcu(&rs->rs_bound_node);
rds_sock_put(rs);
rs->rs_bound_addr = 0;
}
@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
- if (trans == NULL) {
+ if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
if (printk_ratelimit())
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
out:
release_sock(sk);
+
+ /* we might have called rds_remove_bound on error */
+ if (ret)
+ synchronize_rcu();
return ret;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 0871a29f078..75ea686f27d 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
unsigned long flags;
map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
- if (map == NULL)
+ if (!map)
return NULL;
map->m_addr = addr;
@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
ret = rds_cong_tree_walk(addr, map);
spin_unlock_irqrestore(&rds_cong_lock, flags);
- if (ret == NULL) {
+ if (!ret) {
ret = map;
map = NULL;
}
@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
- if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+ if (!(conn->c_lcong && conn->c_fcong))
return -ENOMEM;
return 0;
@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
if (!test_and_set_bit(0, &conn->c_map_queued)) {
rds_stats_inc(s_cong_update_queued);
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ rds_send_xmit(conn);
}
}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7619b671ca2..9334d892366 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -37,7 +37,6 @@
#include "rds.h"
#include "loop.h"
-#include "rdma.h"
#define RDS_CONNECTION_HASH_BITS 12
#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
} while (0)
-static inline int rds_conn_is_sending(struct rds_connection *conn)
-{
- int ret = 0;
-
- if (!mutex_trylock(&conn->c_send_lock))
- ret = 1;
- else
- mutex_unlock(&conn->c_send_lock);
-
- return ret;
-}
-
+/* rcu read lock must be held or the connection spinlock */
static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
__be32 laddr, __be32 faddr,
struct rds_transport *trans)
@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
struct rds_connection *conn, *ret = NULL;
struct hlist_node *pos;
- hlist_for_each_entry(conn, pos, head, c_hash_node) {
+ hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
conn->c_trans == trans) {
ret = conn;
@@ -100,7 +88,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
* and receiving over this connection again in the future. It is up to
* the transport to have serialized this call with its send and recv.
*/
-void rds_conn_reset(struct rds_connection *conn)
+static void rds_conn_reset(struct rds_connection *conn)
{
rdsdebug("connection %pI4 to %pI4 reset\n",
&conn->c_laddr, &conn->c_faddr);
@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
{
struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+ struct rds_transport *loop_trans;
unsigned long flags;
int ret;
- spin_lock_irqsave(&rds_conn_lock, flags);
+ rcu_read_lock();
conn = rds_conn_lookup(head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
!is_outgoing) {
@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
parent = conn;
conn = parent->c_passive;
}
- spin_unlock_irqrestore(&rds_conn_lock, flags);
+ rcu_read_unlock();
if (conn)
goto out;
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
- if (conn == NULL) {
+ if (!conn) {
conn = ERR_PTR(-ENOMEM);
goto out;
}
@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
- mutex_init(&conn->c_send_lock);
+ init_waitqueue_head(&conn->c_waitq);
INIT_LIST_HEAD(&conn->c_send_queue);
INIT_LIST_HEAD(&conn->c_retrans);
@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- if (rds_trans_get_preferred(faddr)) {
+ loop_trans = rds_trans_get_preferred(faddr);
+ if (loop_trans) {
+ rds_trans_put(loop_trans);
conn->c_loopback = 1;
if (is_outgoing && trans->t_prefer_loopback) {
/* "outgoing" connection - and the transport
@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
- hlist_add_head(&conn->c_hash_node, head);
+ hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
}
@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
+void rds_conn_shutdown(struct rds_connection *conn)
+{
+ /* shut it down unless it's down already */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, else we may be
+ * deadlocking with the CM handler. Instead, the CM event
+ * handler is supposed to check for state DISCONNECTING
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+ && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+ rds_conn_error(conn, "shutdown called in state %d\n",
+ atomic_read(&conn->c_state));
+ mutex_unlock(&conn->c_cm_lock);
+ return;
+ }
+ mutex_unlock(&conn->c_cm_lock);
+
+ wait_event(conn->c_waitq,
+ !test_bit(RDS_IN_XMIT, &conn->c_flags));
+
+ conn->c_trans->conn_shutdown(conn);
+ rds_conn_reset(conn);
+
+ if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ /* This can happen - eg when we're in the middle of tearing
+ * down the connection, and someone unloads the rds module.
+ * Quite reproduceable with loopback connections.
+ * Mostly harmless.
+ */
+ rds_conn_error(conn,
+ "%s: failed to transition to state DOWN, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ return;
+ }
+ }
+
+ /* Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer. */
+ cancel_delayed_work_sync(&conn->c_conn_w);
+ rcu_read_lock();
+ if (!hlist_unhashed(&conn->c_hash_node)) {
+ rcu_read_unlock();
+ rds_queue_reconnect(conn);
+ } else {
+ rcu_read_unlock();
+ }
+}
+
+/*
+ * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances. It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
+ */
void rds_conn_destroy(struct rds_connection *conn)
{
struct rds_message *rm, *rtmp;
+ unsigned long flags;
rdsdebug("freeing conn %p for %pI4 -> "
"%pI4\n", conn, &conn->c_laddr,
&conn->c_faddr);
- hlist_del_init(&conn->c_hash_node);
+ /* Ensure conn will not be scheduled for reconnect */
+ spin_lock_irq(&rds_conn_lock);
+ hlist_del_init_rcu(&conn->c_hash_node);
+ spin_unlock_irq(&rds_conn_lock);
+ synchronize_rcu();
- /* wait for the rds thread to shut it down */
- atomic_set(&conn->c_state, RDS_CONN_ERROR);
- cancel_delayed_work(&conn->c_conn_w);
- queue_work(rds_wq, &conn->c_down_w);
- flush_workqueue(rds_wq);
+ /* shut the connection down */
+ rds_conn_drop(conn);
+ flush_work(&conn->c_down_w);
+
+ /* make sure lingering queued work won't try to ref the conn */
+ cancel_delayed_work_sync(&conn->c_send_w);
+ cancel_delayed_work_sync(&conn->c_recv_w);
/* tear down queued messages */
list_for_each_entry_safe(rm, rtmp,
@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn)
BUG_ON(!list_empty(&conn->c_retrans));
kmem_cache_free(rds_conn_slab, conn);
+ spin_lock_irqsave(&rds_conn_lock, flags);
rds_conn_count--;
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
}
EXPORT_SYMBOL_GPL(rds_conn_destroy);
@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct list_head *list;
struct rds_connection *conn;
struct rds_message *rm;
- unsigned long flags;
unsigned int total = 0;
+ unsigned long flags;
size_t i;
len /= sizeof(struct rds_info_message);
- spin_lock_irqsave(&rds_conn_lock, flags);
+ rcu_read_lock();
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
- hlist_for_each_entry(conn, pos, head, c_hash_node) {
+ hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
if (want_send)
list = &conn->c_send_queue;
else
list = &conn->c_retrans;
- spin_lock(&conn->c_lock);
+ spin_lock_irqsave(&conn->c_lock, flags);
/* XXX too lazy to maintain counts.. */
list_for_each_entry(rm, list, m_conn_item) {
@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
conn->c_faddr, 0);
}
- spin_unlock(&conn->c_lock);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
}
}
-
- spin_unlock_irqrestore(&rds_conn_lock, flags);
+ rcu_read_unlock();
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
uint64_t buffer[(item_len + 7) / 8];
struct hlist_head *head;
struct hlist_node *pos;
- struct hlist_node *tmp;
struct rds_connection *conn;
- unsigned long flags;
size_t i;
- spin_lock_irqsave(&rds_conn_lock, flags);
+ rcu_read_lock();
lens->nr = 0;
lens->each = item_len;
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
- hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
+ hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
/* XXX no c_lock usage.. */
if (!visitor(conn, buffer))
@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
lens->nr++;
}
}
-
- spin_unlock_irqrestore(&rds_conn_lock, flags);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
sizeof(cinfo->transport));
cinfo->flags = 0;
- rds_conn_info_set(cinfo->flags,
- rds_conn_is_sending(conn), SENDING);
+ rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+ SENDING);
/* XXX Future: return the state rather than these funky bits */
rds_conn_info_set(cinfo->flags,
atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
sizeof(struct rds_info_connection));
}
-int __init rds_conn_init(void)
+int rds_conn_init(void)
{
rds_conn_slab = kmem_cache_create("rds_connection",
sizeof(struct rds_connection),
0, 0, NULL);
- if (rds_conn_slab == NULL)
+ if (!rds_conn_slab)
return -ENOMEM;
rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
@@ -487,6 +546,18 @@ void rds_conn_drop(struct rds_connection *conn)
EXPORT_SYMBOL_GPL(rds_conn_drop);
/*
+ * If the connection is down, trigger a connect. We may have scheduled a
+ * delayed reconnect however - in this case we should not interfere.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+ if (rds_conn_state(conn) == RDS_CONN_DOWN &&
+ !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
+/*
* An error occurred on the connection
*/
void
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8f2d6dd7700..4123967d4d6 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,7 +42,7 @@
#include "rds.h"
#include "ib.h"
-unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
@@ -53,13 +53,72 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
struct list_head rds_ib_devices;
/* NOTE: if also grabbing ibdev lock, grab this first */
DEFINE_SPINLOCK(ib_nodev_conns_lock);
LIST_HEAD(ib_nodev_conns);
-void rds_ib_add_one(struct ib_device *device)
+static void rds_ib_nodev_connect(void)
+{
+ struct rds_ib_connection *ic;
+
+ spin_lock(&ib_nodev_conns_lock);
+ list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+ rds_conn_connect_if_down(ic->conn);
+ spin_unlock(&ib_nodev_conns_lock);
+}
+
+static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_connection *ic;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+ list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+ rds_conn_drop(ic->conn);
+ spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+ struct rds_ib_ipaddr *i_ipaddr, *i_next;
+ struct rds_ib_device *rds_ibdev = container_of(work,
+ struct rds_ib_device, free_work);
+
+ if (rds_ibdev->mr_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+ if (rds_ibdev->mr)
+ ib_dereg_mr(rds_ibdev->mr);
+ if (rds_ibdev->pd)
+ ib_dealloc_pd(rds_ibdev->pd);
+
+ list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ }
+
+ kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+ BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+ if (atomic_dec_and_test(&rds_ibdev->refcount))
+ queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
+static void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
struct ib_device_attr *dev_attr;
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
goto free_attr;
}
- rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+ rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+ ibdev_to_node(device));
if (!rds_ibdev)
goto free_attr;
spin_lock_init(&rds_ibdev->spinlock);
+ atomic_set(&rds_ibdev->refcount, 1);
+ INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
fmr_pool_size;
+ rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+ rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
- if (IS_ERR(rds_ibdev->pd))
- goto free_dev;
+ if (IS_ERR(rds_ibdev->pd)) {
+ rds_ibdev->pd = NULL;
+ goto put_dev;
+ }
- rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(rds_ibdev->mr))
- goto err_pd;
+ rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rds_ibdev->mr)) {
+ rds_ibdev->mr = NULL;
+ goto put_dev;
+ }
rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
if (IS_ERR(rds_ibdev->mr_pool)) {
rds_ibdev->mr_pool = NULL;
- goto err_mr;
+ goto put_dev;
}
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
- list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+ down_write(&rds_ib_devices_lock);
+ list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+ up_write(&rds_ib_devices_lock);
+ atomic_inc(&rds_ibdev->refcount);
ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+ atomic_inc(&rds_ibdev->refcount);
- goto free_attr;
+ rds_ib_nodev_connect();
-err_mr:
- ib_dereg_mr(rds_ibdev->mr);
-err_pd:
- ib_dealloc_pd(rds_ibdev->pd);
-free_dev:
- kfree(rds_ibdev);
+put_dev:
+ rds_ib_dev_put(rds_ibdev);
free_attr:
kfree(dev_attr);
}
-void rds_ib_remove_one(struct ib_device *device)
+/*
+ * New connections use this to find the device to associate with the
+ * connection. It's not in the fast path so we're not concerned about the
+ * performance of the IB call. (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period. The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if its arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
- struct rds_ib_ipaddr *i_ipaddr, *i_next;
+ rcu_read_lock();
rds_ibdev = ib_get_client_data(device, &rds_ib_client);
- if (!rds_ibdev)
- return;
+ if (rds_ibdev)
+ atomic_inc(&rds_ibdev->refcount);
+ rcu_read_unlock();
+ return rds_ibdev;
+}
- list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
- list_del(&i_ipaddr->list);
- kfree(i_ipaddr);
- }
+/*
+ * The IB stack is letting us know that a device is going away. This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
+static void rds_ib_remove_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
- rds_ib_destroy_conns(rds_ibdev);
+ rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+ if (!rds_ibdev)
+ return;
- if (rds_ibdev->mr_pool)
- rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+ rds_ib_dev_shutdown(rds_ibdev);
- ib_dereg_mr(rds_ibdev->mr);
+ /* stop connection attempts from getting a reference to this device. */
+ ib_set_client_data(device, &rds_ib_client, NULL);
- while (ib_dealloc_pd(rds_ibdev->pd)) {
- rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
- msleep(1);
- }
+ down_write(&rds_ib_devices_lock);
+ list_del_rcu(&rds_ibdev->list);
+ up_write(&rds_ib_devices_lock);
- list_del(&rds_ibdev->list);
- kfree(rds_ibdev);
+ /*
+ * This synchronize rcu is waiting for readers of both the ib
+ * client data and the devices list to finish before we drop
+ * both of those references.
+ */
+ synchronize_rcu();
+ rds_ib_dev_put(rds_ibdev);
+ rds_ib_dev_put(rds_ibdev);
}
struct ib_client rds_ib_client = {
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+ rds_ibdev = ic->rds_ibdev;
iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr)
return ret;
}
+static void rds_ib_unregister_client(void)
+{
+ ib_unregister_client(&rds_ib_client);
+ /* wait for rds_ib_dev_free() to complete */
+ flush_workqueue(rds_wq);
+}
+
void rds_ib_exit(void)
{
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+ rds_ib_unregister_client();
rds_ib_destroy_nodev_conns();
- ib_unregister_client(&rds_ib_client);
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
+ rds_ib_fmr_exit();
}
struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check,
.xmit_complete = rds_ib_xmit_complete,
.xmit = rds_ib_xmit,
- .xmit_cong_map = NULL,
.xmit_rdma = rds_ib_xmit_rdma,
+ .xmit_atomic = rds_ib_xmit_atomic,
.recv = rds_ib_recv,
.conn_alloc = rds_ib_conn_alloc,
.conn_free = rds_ib_conn_free,
.conn_connect = rds_ib_conn_connect,
.conn_shutdown = rds_ib_conn_shutdown,
.inc_copy_to_user = rds_ib_inc_copy_to_user,
- .inc_purge = rds_ib_inc_purge,
.inc_free = rds_ib_inc_free,
.cm_initiate_connect = rds_ib_cm_initiate_connect,
.cm_handle_connect = rds_ib_cm_handle_connect,
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = {
.t_type = RDS_TRANS_IB
};
-int __init rds_ib_init(void)
+int rds_ib_init(void)
{
int ret;
INIT_LIST_HEAD(&rds_ib_devices);
- ret = ib_register_client(&rds_ib_client);
+ ret = rds_ib_fmr_init();
if (ret)
goto out;
+ ret = ib_register_client(&rds_ib_client);
+ if (ret)
+ goto out_fmr_exit;
+
ret = rds_ib_sysctl_init();
if (ret)
goto out_ibreg;
@@ -317,7 +429,9 @@ out_recv:
out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
- ib_unregister_client(&rds_ib_client);
+ rds_ib_unregister_client();
+out_fmr_exit:
+ rds_ib_fmr_exit();
out:
return ret;
}
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 64df4e79b29..e34ad032b66 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -3,11 +3,13 @@
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
#include "rds.h"
#include "rdma_transport.h"
#define RDS_FMR_SIZE 256
-#define RDS_FMR_POOL_SIZE 4096
+#define RDS_FMR_POOL_SIZE 8192
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
@@ -19,6 +21,9 @@
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
+#define RDS_IB_RECYCLE_BATCH_COUNT 32
+
+extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
/*
@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices;
* try and minimize the amount of memory tied up both the device and
* socket receive queues.
*/
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
struct rds_page_frag {
struct list_head f_item;
- struct page *f_page;
- unsigned long f_offset;
- dma_addr_t f_mapped;
+ struct list_head f_cache_entry;
+ struct scatterlist f_sg;
};
struct rds_ib_incoming {
struct list_head ii_frags;
+ struct list_head ii_cache_entry;
struct rds_incoming ii_inc;
};
+struct rds_ib_cache_head {
+ struct list_head *first;
+ unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+ struct rds_ib_cache_head *percpu;
+ struct list_head *xfer;
+ struct list_head *ready;
+};
+
struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
__be32 dp_saddr;
@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
};
struct rds_ib_send_work {
- struct rds_message *s_rm;
- struct rds_rdma_op *s_op;
+ void *s_op;
struct ib_send_wr s_wr;
struct ib_sge s_sge[RDS_IB_MAX_SGE];
unsigned long s_queued;
@@ -92,10 +105,11 @@ struct rds_ib_connection {
/* tx */
struct rds_ib_work_ring i_send_ring;
- struct rds_message *i_rm;
+ struct rm_data_op *i_data_op;
struct rds_header *i_send_hdrs;
u64 i_send_hdrs_dma;
struct rds_ib_send_work *i_sends;
+ atomic_t i_signaled_sends;
/* rx */
struct tasklet_struct i_recv_tasklet;
@@ -106,8 +120,9 @@ struct rds_ib_connection {
struct rds_header *i_recv_hdrs;
u64 i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs;
- struct rds_page_frag i_frag;
u64 i_ack_recv; /* last ACK received */
+ struct rds_ib_refill_cache i_cache_incs;
+ struct rds_ib_refill_cache i_cache_frags;
/* sending acks */
unsigned long i_ack_flags;
@@ -138,7 +153,6 @@ struct rds_ib_connection {
/* Batched completions */
unsigned int i_unsignaled_wrs;
- long i_unsignaled_bytes;
};
/* This assumes that atomic_t is at least 32 bits */
@@ -164,9 +178,17 @@ struct rds_ib_device {
unsigned int max_fmrs;
int max_sge;
unsigned int max_wrs;
+ unsigned int max_initiator_depth;
+ unsigned int max_responder_resources;
spinlock_t spinlock; /* protect the above */
+ atomic_t refcount;
+ struct work_struct free_work;
};
+#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
+#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
/* bits for i_ack_flags */
#define IB_ACK_IN_FLIGHT 0
#define IB_ACK_REQUESTED 1
@@ -202,6 +224,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rdma_mr_pool_flush;
uint64_t s_ib_rdma_mr_pool_wait;
uint64_t s_ib_rdma_mr_pool_depleted;
+ uint64_t s_ib_atomic_cswp;
+ uint64_t s_ib_atomic_fadd;
};
extern struct workqueue_struct *rds_ib_wq;
@@ -241,11 +265,10 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
/* ib.c */
extern struct rds_transport rds_ib_transport;
-extern void rds_ib_add_one(struct ib_device *device);
-extern void rds_ib_remove_one(struct ib_device *device);
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;
-extern unsigned int fmr_pool_size;
extern unsigned int fmr_message_size;
extern unsigned int rds_ib_retry_count;
@@ -258,7 +281,7 @@ void rds_ib_conn_free(void *arg);
int rds_ib_conn_connect(struct rds_connection *conn);
void rds_ib_conn_shutdown(struct rds_connection *conn);
void rds_ib_state_change(struct sock *sk);
-int __init rds_ib_listen_init(void);
+int rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -275,15 +298,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_ib_destroy_nodev_conns(void)
-{
- __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
-}
-static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
-{
- __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
-}
+void rds_ib_destroy_nodev_conns(void);
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -292,14 +307,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
+int rds_ib_fmr_init(void);
+void rds_ib_fmr_exit(void);
/* ib_recv.c */
-int __init rds_ib_recv_init(void);
+int rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
- gfp_t page_gfp, int prefill);
-void rds_ib_inc_purge(struct rds_incoming *inc);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
void rds_ib_inc_free(struct rds_incoming *inc);
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size);
@@ -325,17 +342,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
void rds_ib_xmit_complete(struct rds_connection *conn);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_ib_send_init_ring(struct rds_ib_connection *ic);
void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -344,7 +363,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
/* ib_sysctl.c */
-int __init rds_ib_sysctl_init(void);
+int rds_ib_sysctl_init(void);
void rds_ib_sysctl_exit(void);
extern unsigned long rds_ib_sysctl_max_send_wr;
extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -352,30 +371,5 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
extern unsigned long rds_ib_sysctl_max_recv_allocation;
extern unsigned int rds_ib_sysctl_flow_control;
-extern ctl_table rds_ib_sysctl_table[];
-
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- *
- * From version 3.1 onwards, header is in front of data in the sge.
- */
-static inline struct ib_sge *
-rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
- if (ic->conn->c_version > RDS_PROTOCOL_3_0)
- return &sge[0];
- else
- return &sge[1];
-}
-
-static inline struct ib_sge *
-rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
- if (ic->conn->c_version > RDS_PROTOCOL_3_0)
- return &sge[1];
- else
- return &sge[0];
-}
#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f68832798db..ee369d201a6 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -38,6 +38,36 @@
#include "rds.h"
#include "ib.h"
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+ [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+ RDS_IB_EVENT_STRING(CQ_ERR),
+ RDS_IB_EVENT_STRING(QP_FATAL),
+ RDS_IB_EVENT_STRING(QP_REQ_ERR),
+ RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+ RDS_IB_EVENT_STRING(COMM_EST),
+ RDS_IB_EVENT_STRING(SQ_DRAINED),
+ RDS_IB_EVENT_STRING(PATH_MIG),
+ RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+ RDS_IB_EVENT_STRING(DEVICE_FATAL),
+ RDS_IB_EVENT_STRING(PORT_ACTIVE),
+ RDS_IB_EVENT_STRING(PORT_ERR),
+ RDS_IB_EVENT_STRING(LID_CHANGE),
+ RDS_IB_EVENT_STRING(PKEY_CHANGE),
+ RDS_IB_EVENT_STRING(SM_CHANGE),
+ RDS_IB_EVENT_STRING(SRQ_ERR),
+ RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+ RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+ RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+ return rds_str_array(rds_ib_event_type_strings,
+ ARRAY_SIZE(rds_ib_event_type_strings), type);
+};
+
/*
* Set the selected protocol version
*/
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
{
const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_ib_device *rds_ibdev;
struct ib_qp_attr qp_attr;
int err;
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
}
- printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
+ if (conn->c_version < RDS_PROTOCOL(3,1)) {
+ printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+ " no longer supported\n",
+ &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version));
+ rds_conn_destroy(conn);
+ return;
+ } else {
+ printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+ &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+ }
/*
* Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_recv_init_ring(ic);
/* Post receive buffers - as a side effect, this will update
* the posted credit count. */
- rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+ rds_ib_recv_refill(conn, 1);
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
if (err)
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
- /* update ib_device with this local ipaddr & conn */
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
- err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+ /* update ib_device with this local ipaddr */
+ err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
if (err)
- printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
- rds_ib_add_conn(rds_ibdev, conn);
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+ err);
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
struct rdma_conn_param *conn_param,
struct rds_ib_connect_private *dp,
- u32 protocol_version)
+ u32 protocol_version,
+ u32 max_responder_resources,
+ u32 max_initiator_depth)
{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
memset(conn_param, 0, sizeof(struct rdma_conn_param));
- /* XXX tune these? */
- conn_param->responder_resources = 1;
- conn_param->initiator_depth = 1;
+
+ conn_param->responder_resources =
+ min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+ conn_param->initiator_depth =
+ min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
conn_param->rnr_retry_count = 7;
if (dp) {
- struct rds_ib_connection *ic = conn->c_transport_data;
-
memset(dp, 0, sizeof(*dp));
dp->dp_saddr = conn->c_laddr;
dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
- rdsdebug("event %u data %p\n", event->event, data);
+ rdsdebug("event %u (%s) data %p\n",
+ event->event, rds_ib_event_str(event->event), data);
}
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
struct rds_connection *conn = data;
struct rds_ib_connection *ic = conn->c_transport_data;
- rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+ rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+ rds_ib_event_str(event->event));
switch (event->event) {
case IB_EVENT_COMM_EST:
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
- rdsdebug("Fatal QP Event %u "
+ rdsdebug("Fatal QP Event %u (%s) "
"- connection %pI4->%pI4, reconnecting\n",
- event->event, &conn->c_laddr, &conn->c_faddr);
+ event->event, rds_ib_event_str(event->event),
+ &conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
break;
}
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct rds_ib_device *rds_ibdev;
int ret;
- /* rds_ib_add_one creates a rds_ib_device object per IB device,
- * and allocates a protection domain, memory range and FMR pool
- * for each. If that fails for any reason, it will not register
- * the rds_ibdev at all.
+ /*
+ * It's normal to see a null device if an incoming connection races
+ * with device removal, so we don't print a warning.
*/
- rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
- if (rds_ibdev == NULL) {
- if (printk_ratelimit())
- printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
- dev->name);
+ rds_ibdev = rds_ib_get_client_data(dev);
+ if (!rds_ibdev)
return -EOPNOTSUPP;
- }
+
+ /* add the conn now so that connection establishment has the dev */
+ rds_ib_add_conn(rds_ibdev, conn);
if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
- if (ic->i_send_hdrs == NULL) {
+ if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (ic->i_recv_hdrs == NULL) {
+ if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
- if (ic->i_ack == NULL) {
+ if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
- ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
- if (ic->i_sends == NULL) {
+ ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+ ibdev_to_node(dev));
+ if (!ic->i_sends) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
}
memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
- ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
- if (ic->i_recvs == NULL) {
+ ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+ ibdev_to_node(dev));
+ if (!ic->i_recvs) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_send_cq, ic->i_recv_cq);
out:
+ rds_ib_dev_put(rds_ibdev);
return ret;
}
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param;
u32 version;
- int err, destroy = 1;
+ int err = 1, destroy = 1;
/* Check whether the remote protocol version matches ours. */
version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
/* Wait and see - our connect may still be succeeding */
rds_ib_stats_inc(s_ib_connect_raced);
}
- mutex_unlock(&conn->c_cm_lock);
goto out;
}
@@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
err = rds_ib_setup_qp(conn);
if (err) {
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
- mutex_unlock(&conn->c_cm_lock);
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+ event->param.conn.responder_resources,
+ event->param.conn.initiator_depth);
/* rdma_accept() calls rdma_reject() internally if it fails */
err = rdma_accept(cm_id, &conn_param);
- mutex_unlock(&conn->c_cm_lock);
- if (err) {
+ if (err)
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
- goto out;
- }
-
- return 0;
out:
- rdma_reject(cm_id, NULL, 0);
+ if (conn)
+ mutex_unlock(&conn->c_cm_lock);
+ if (err)
+ rdma_reject(cm_id, NULL, 0);
return destroy;
}
@@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+ UINT_MAX, UINT_MAX);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
ic->i_cm_id, err);
}
+ /*
+ * We want to wait for tx and rx completion to finish
+ * before we tear down the connection, but we have to be
+ * careful not to get stuck waiting on a send ring that
+ * only has unsignaled sends in it. We've shutdown new
+ * sends before getting here so by waiting for signaled
+ * sends to complete we're ensured that there will be no
+ * more tx processing.
+ */
wait_event(rds_ib_ring_empty_wait,
- rds_ib_ring_empty(&ic->i_send_ring) &&
- rds_ib_ring_empty(&ic->i_recv_ring));
+ rds_ib_ring_empty(&ic->i_recv_ring) &&
+ (atomic_read(&ic->i_signaled_sends) == 0));
+ tasklet_kill(&ic->i_recv_tasklet);
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
@@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
BUG_ON(ic->rds_ibdev);
/* Clear pending transmit */
- if (ic->i_rm) {
- rds_message_put(ic->i_rm);
- ic->i_rm = NULL;
+ if (ic->i_data_op) {
+ struct rds_message *rm;
+
+ rm = container_of(ic->i_data_op, struct rds_message, data);
+ rds_message_put(rm);
+ ic->i_data_op = NULL;
}
/* Clear the ACK state */
@@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_ib_connection *ic;
unsigned long flags;
+ int ret;
/* XXX too lazy? */
ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
- if (ic == NULL)
+ if (!ic)
return -ENOMEM;
+ ret = rds_ib_recv_alloc_caches(ic);
+ if (ret) {
+ kfree(ic);
+ return ret;
+ }
+
INIT_LIST_HEAD(&ic->ib_node);
tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
(unsigned long) ic);
@@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
#endif
+ atomic_set(&ic->i_signaled_sends, 0);
/*
* rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg)
list_del(&ic->ib_node);
spin_unlock_irq(lock_ptr);
+ rds_ib_recv_free_caches(ic);
+
kfree(ic);
}
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a54cd63f9e3..18a833c450c 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -32,11 +32,16 @@
*/
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/rculist.h>
#include "rds.h"
-#include "rdma.h"
#include "ib.h"
+#include "xlist.h"
+static struct workqueue_struct *rds_ib_fmr_wq;
+
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
/*
* This is stored as mr->r_trans_private.
@@ -45,7 +50,11 @@ struct rds_ib_mr {
struct rds_ib_device *device;
struct rds_ib_mr_pool *pool;
struct ib_fmr *fmr;
- struct list_head list;
+
+ struct xlist_head xlist;
+
+ /* unmap_list is for freeing */
+ struct list_head unmap_list;
unsigned int remap_count;
struct scatterlist *sg;
@@ -59,14 +68,16 @@ struct rds_ib_mr {
*/
struct rds_ib_mr_pool {
struct mutex flush_lock; /* serialize fmr invalidate */
- struct work_struct flush_worker; /* flush worker */
+ struct delayed_work flush_worker; /* flush worker */
- spinlock_t list_lock; /* protect variables below */
atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # dirty of MRs */
- struct list_head drop_list; /* MRs that have reached their max_maps limit */
- struct list_head free_list; /* unused MRs */
- struct list_head clean_list; /* unused & unamapped MRs */
+
+ struct xlist_head drop_list; /* MRs that have reached their max_maps limit */
+ struct xlist_head free_list; /* unused MRs */
+ struct xlist_head clean_list; /* global unused & unamapped MRs */
+ wait_queue_head_t flush_wait;
+
atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_items;
unsigned long max_items_soft;
@@ -74,7 +85,7 @@ struct rds_ib_mr_pool {
struct ib_fmr_attr fmr_attr;
};
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
struct rds_ib_device *rds_ibdev;
struct rds_ib_ipaddr *i_ipaddr;
- list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
- spin_lock_irq(&rds_ibdev->spinlock);
- list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+ list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
- spin_unlock_irq(&rds_ibdev->spinlock);
+ atomic_inc(&rds_ibdev->refcount);
+ rcu_read_unlock();
return rds_ibdev;
}
}
- spin_unlock_irq(&rds_ibdev->spinlock);
}
+ rcu_read_unlock();
return NULL;
}
@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
i_ipaddr->ipaddr = ipaddr;
spin_lock_irq(&rds_ibdev->spinlock);
- list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+ list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
spin_unlock_irq(&rds_ibdev->spinlock);
return 0;
@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
- struct rds_ib_ipaddr *i_ipaddr, *next;
+ struct rds_ib_ipaddr *i_ipaddr;
+ struct rds_ib_ipaddr *to_free = NULL;
+
spin_lock_irq(&rds_ibdev->spinlock);
- list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
+ list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
- list_del(&i_ipaddr->list);
- kfree(i_ipaddr);
+ list_del_rcu(&i_ipaddr->list);
+ to_free = i_ipaddr;
break;
}
}
spin_unlock_irq(&rds_ibdev->spinlock);
+
+ if (to_free) {
+ synchronize_rcu();
+ kfree(to_free);
+ }
}
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
struct rds_ib_device *rds_ibdev_old;
rds_ibdev_old = rds_ib_get_device(ipaddr);
- if (rds_ibdev_old)
+ if (rds_ibdev_old) {
rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+ rds_ib_dev_put(rds_ibdev_old);
+ }
return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
@@ -150,12 +171,13 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
BUG_ON(list_empty(&ic->ib_node));
list_del(&ic->ib_node);
- spin_lock_irq(&rds_ibdev->spinlock);
+ spin_lock(&rds_ibdev->spinlock);
list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
- spin_unlock_irq(&rds_ibdev->spinlock);
+ spin_unlock(&rds_ibdev->spinlock);
spin_unlock_irq(&ib_nodev_conns_lock);
ic->rds_ibdev = rds_ibdev;
+ atomic_inc(&rds_ibdev->refcount);
}
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
spin_unlock(&ib_nodev_conns_lock);
ic->rds_ibdev = NULL;
+ rds_ib_dev_put(rds_ibdev);
}
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+void rds_ib_destroy_nodev_conns(void)
{
struct rds_ib_connection *ic, *_ic;
LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */
- spin_lock_irq(list_lock);
- list_splice(list, &tmp_list);
- INIT_LIST_HEAD(list);
- spin_unlock_irq(list_lock);
+ spin_lock_irq(&ib_nodev_conns_lock);
+ list_splice(&ib_nodev_conns, &tmp_list);
+ spin_unlock_irq(&ib_nodev_conns_lock);
list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
rds_conn_destroy(ic->conn);
@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
if (!pool)
return ERR_PTR(-ENOMEM);
- INIT_LIST_HEAD(&pool->free_list);
- INIT_LIST_HEAD(&pool->drop_list);
- INIT_LIST_HEAD(&pool->clean_list);
+ INIT_XLIST_HEAD(&pool->free_list);
+ INIT_XLIST_HEAD(&pool->drop_list);
+ INIT_XLIST_HEAD(&pool->clean_list);
mutex_init(&pool->flush_lock);
- spin_lock_init(&pool->list_lock);
- INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+ init_waitqueue_head(&pool->flush_wait);
+ INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
pool->fmr_attr.max_pages = fmr_message_size;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
- flush_workqueue(rds_wq);
- rds_ib_flush_mr_pool(pool, 1);
+ cancel_delayed_work_sync(&pool->flush_worker);
+ rds_ib_flush_mr_pool(pool, 1, NULL);
WARN_ON(atomic_read(&pool->item_count));
WARN_ON(atomic_read(&pool->free_pinned));
kfree(pool);
}
+static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
+ struct rds_ib_mr **ibmr_ret)
+{
+ struct xlist_head *ibmr_xl;
+ ibmr_xl = xlist_del_head_fast(xl);
+ *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
+}
+
static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
- unsigned long flags;
+ struct xlist_head *ret;
+ unsigned long *flag;
- spin_lock_irqsave(&pool->list_lock, flags);
- if (!list_empty(&pool->clean_list)) {
- ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
- list_del_init(&ibmr->list);
- }
- spin_unlock_irqrestore(&pool->list_lock, flags);
+ preempt_disable();
+ flag = &__get_cpu_var(clean_list_grace);
+ set_bit(CLEAN_LIST_BUSY_BIT, flag);
+ ret = xlist_del_head(&pool->clean_list);
+ if (ret)
+ ibmr = list_entry(ret, struct rds_ib_mr, xlist);
+ clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+ preempt_enable();
return ibmr;
}
+static inline void wait_clean_list_grace(void)
+{
+ int cpu;
+ unsigned long *flag;
+
+ for_each_online_cpu(cpu) {
+ flag = &per_cpu(clean_list_grace, cpu);
+ while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
+ cpu_relax();
+ }
+}
+
static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
struct rds_ib_mr *ibmr = NULL;
int err = 0, iter = 0;
+ if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+
while (1) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr)
@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
/* We do have some empty MRs. Flush them out. */
rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
- rds_ib_flush_mr_pool(pool, 0);
+ rds_ib_flush_mr_pool(pool, 0, &ibmr);
+ if (ibmr)
+ return ibmr;
}
- ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
if (!ibmr) {
err = -ENOMEM;
goto out_no_cigar;
}
+ memset(ibmr, 0, sizeof(*ibmr));
+
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE),
+ IB_ACCESS_REMOTE_WRITE|
+ IB_ACCESS_REMOTE_ATOMIC),
&pool->fmr_attr);
if (IS_ERR(ibmr->fmr)) {
err = PTR_ERR(ibmr->fmr);
@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
if (page_cnt > fmr_message_size)
return -EINVAL;
- dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
+ dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+ rdsibdev_to_node(rds_ibdev));
if (!dma_pages)
return -ENOMEM;
@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
/* FIXME we need a way to tell a r/w MR
* from a r/o MR */
- BUG_ON(in_interrupt());
+ BUG_ON(irqs_disabled());
set_page_dirty(page);
put_page(page);
}
@@ -477,33 +531,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
}
/*
+ * given an xlist of mrs, put them all into the list_head for more processing
+ */
+static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+{
+ struct rds_ib_mr *ibmr;
+ struct xlist_head splice;
+ struct xlist_head *cur;
+ struct xlist_head *next;
+
+ splice.next = NULL;
+ xlist_splice(xlist, &splice);
+ cur = splice.next;
+ while (cur) {
+ next = cur->next;
+ ibmr = list_entry(cur, struct rds_ib_mr, xlist);
+ list_add_tail(&ibmr->unmap_list, list);
+ cur = next;
+ }
+}
+
+/*
+ * this takes a list head of mrs and turns it into an xlist of clusters.
+ * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
+ * reuse.
+ */
+static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
+ struct list_head *list, struct xlist_head *xlist,
+ struct xlist_head **tail_ret)
+{
+ struct rds_ib_mr *ibmr;
+ struct xlist_head *cur_mr = xlist;
+ struct xlist_head *tail_mr = NULL;
+
+ list_for_each_entry(ibmr, list, unmap_list) {
+ tail_mr = &ibmr->xlist;
+ tail_mr->next = NULL;
+ cur_mr->next = tail_mr;
+ cur_mr = tail_mr;
+ }
+ *tail_ret = tail_mr;
+}
+
+/*
* Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped.
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+ int free_all, struct rds_ib_mr **ibmr_ret)
{
struct rds_ib_mr *ibmr, *next;
+ struct xlist_head clean_xlist;
+ struct xlist_head *clean_tail;
LIST_HEAD(unmap_list);
LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
- unsigned long flags;
unsigned int nfreed = 0, ncleaned = 0, free_goal;
int ret = 0;
rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
- mutex_lock(&pool->flush_lock);
+ if (ibmr_ret) {
+ DEFINE_WAIT(wait);
+ while(!mutex_trylock(&pool->flush_lock)) {
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr) {
+ *ibmr_ret = ibmr;
+ finish_wait(&pool->flush_wait, &wait);
+ goto out_nolock;
+ }
+
+ prepare_to_wait(&pool->flush_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (xlist_empty(&pool->clean_list))
+ schedule();
+
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr) {
+ *ibmr_ret = ibmr;
+ finish_wait(&pool->flush_wait, &wait);
+ goto out_nolock;
+ }
+ }
+ finish_wait(&pool->flush_wait, &wait);
+ } else
+ mutex_lock(&pool->flush_lock);
+
+ if (ibmr_ret) {
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr) {
+ *ibmr_ret = ibmr;
+ goto out;
+ }
+ }
- spin_lock_irqsave(&pool->list_lock, flags);
/* Get the list of all MRs to be dropped. Ordering matters -
- * we want to put drop_list ahead of free_list. */
- list_splice_init(&pool->free_list, &unmap_list);
- list_splice_init(&pool->drop_list, &unmap_list);
+ * we want to put drop_list ahead of free_list.
+ */
+ xlist_append_to_list(&pool->drop_list, &unmap_list);
+ xlist_append_to_list(&pool->free_list, &unmap_list);
if (free_all)
- list_splice_init(&pool->clean_list, &unmap_list);
- spin_unlock_irqrestore(&pool->list_lock, flags);
+ xlist_append_to_list(&pool->clean_list, &unmap_list);
free_goal = rds_ib_flush_goal(pool, free_all);
@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
goto out;
/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
- list_for_each_entry(ibmr, &unmap_list, list)
+ list_for_each_entry(ibmr, &unmap_list, unmap_list)
list_add(&ibmr->fmr->list, &fmr_list);
+
ret = ib_unmap_fmr(&fmr_list);
if (ret)
printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
/* Now we can destroy the DMA mapping and unpin any pages */
- list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
+ list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
rds_ib_stats_inc(s_ib_rdma_mr_free);
- list_del(&ibmr->list);
+ list_del(&ibmr->unmap_list);
ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr);
nfreed++;
@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
ncleaned++;
}
- spin_lock_irqsave(&pool->list_lock, flags);
- list_splice(&unmap_list, &pool->clean_list);
- spin_unlock_irqrestore(&pool->list_lock, flags);
+ if (!list_empty(&unmap_list)) {
+ /* we have to make sure that none of the things we're about
+ * to put on the clean list would race with other cpus trying
+ * to pull items off. The xlist would explode if we managed to
+ * remove something from the clean list and then add it back again
+ * while another CPU was spinning on that same item in xlist_del_head.
+ *
+ * This is pretty unlikely, but just in case wait for an xlist grace period
+ * here before adding anything back into the clean list.
+ */
+ wait_clean_list_grace();
+
+ list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+ if (ibmr_ret)
+ refill_local(pool, &clean_xlist, ibmr_ret);
+
+ /* refill_local may have emptied our list */
+ if (!xlist_empty(&clean_xlist))
+ xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+
+ }
atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count);
@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
out:
mutex_unlock(&pool->flush_lock);
+ if (waitqueue_active(&pool->flush_wait))
+ wake_up(&pool->flush_wait);
+out_nolock:
return ret;
}
+int rds_ib_fmr_init(void)
+{
+ rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
+ if (!rds_ib_fmr_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * By the time this is called all the IB devices should have been torn down and
+ * had their pools freed. As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void rds_ib_fmr_exit(void)
+{
+ destroy_workqueue(rds_ib_fmr_wq);
+}
+
static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
- struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
+ struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
- rds_ib_flush_mr_pool(pool, 0);
+ rds_ib_flush_mr_pool(pool, 0, NULL);
}
void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
- unsigned long flags;
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
/* Return it to the pool's free list */
- spin_lock_irqsave(&pool->list_lock, flags);
if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- list_add(&ibmr->list, &pool->drop_list);
+ xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
else
- list_add(&ibmr->list, &pool->free_list);
+ xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
- spin_unlock_irqrestore(&pool->list_lock, flags);
/* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- queue_work(rds_wq, &pool->flush_worker);
+ queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
if (invalidate) {
if (likely(!in_interrupt())) {
- rds_ib_flush_mr_pool(pool, 0);
+ rds_ib_flush_mr_pool(pool, 0, NULL);
} else {
/* We get here if the user created a MR marked
* as use_once and invalidate at the same time. */
- queue_work(rds_wq, &pool->flush_worker);
+ queue_delayed_work(rds_ib_fmr_wq,
+ &pool->flush_worker, 10);
}
}
+
+ rds_ib_dev_put(rds_ibdev);
}
void rds_ib_flush_mrs(void)
{
struct rds_ib_device *rds_ibdev;
+ down_read(&rds_ib_devices_lock);
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
if (pool)
- rds_ib_flush_mr_pool(pool, 0);
+ rds_ib_flush_mr_pool(pool, 0, NULL);
}
+ up_read(&rds_ib_devices_lock);
}
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
ibmr->device = rds_ibdev;
+ rds_ibdev = NULL;
out:
if (ret) {
@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
rds_ib_free_mr(ibmr, 0);
ibmr = ERR_PTR(ret);
}
+ if (rds_ibdev)
+ rds_ib_dev_put(rds_ibdev);
return ibmr;
}
+
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e9904a6b..e29e0ca32f7 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
static struct kmem_cache *rds_ib_frag_slab;
static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, frag->f_page);
- __free_page(frag->f_page);
- frag->f_page = NULL;
-}
-
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, frag->f_page);
- BUG_ON(frag->f_page != NULL);
- kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
-/*
- * We map a page at a time. Its fragments are posted in order. This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
- struct rds_ib_recv_work *recv)
-{
- struct rds_page_frag *frag = recv->r_frag;
-
- rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
- if (frag->f_mapped)
- ib_dma_unmap_page(ic->i_cm_id->device,
- frag->f_mapped,
- RDS_FRAG_SIZE, DMA_FROM_DEVICE);
- frag->f_mapped = 0;
-}
-
void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{
struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
recv->r_wr.sg_list = recv->r_sge;
recv->r_wr.num_sge = RDS_IB_RECV_SGE;
- sge = rds_ib_data_sge(ic, recv->r_sge);
+ sge = &recv->r_sge[0];
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = &recv->r_sge[1];
sge->addr = 0;
sge->length = RDS_FRAG_SIZE;
sge->lkey = ic->i_mr->lkey;
+ }
+}
- sge = rds_ib_header_sge(ic, recv->r_sge);
- sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = ic->i_mr->lkey;
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+ struct list_head *to)
+{
+ struct list_head *from_last = from->prev;
+
+ list_splice_tail(from_last, to);
+ list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+ struct list_head *tmp;
+
+ tmp = xchg(&cache->xfer, NULL);
+ if (tmp) {
+ if (cache->ready)
+ list_splice_entire_tail(tmp, cache->ready);
+ else
+ cache->ready = tmp;
+ }
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+ struct rds_ib_cache_head *head;
+ int cpu;
+
+ cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+ if (!cache->percpu)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ head = per_cpu_ptr(cache->percpu, cpu);
+ head->first = NULL;
+ head->count = 0;
+ }
+ cache->xfer = NULL;
+ cache->ready = NULL;
+
+ return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+ int ret;
+
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+ if (!ret) {
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+ if (ret)
+ free_percpu(ic->i_cache_incs.percpu);
}
+
+ return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+ struct list_head *caller_list)
+{
+ struct rds_ib_cache_head *head;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ head = per_cpu_ptr(cache->percpu, cpu);
+ if (head->first) {
+ list_splice_entire_tail(head->first, caller_list);
+ head->first = NULL;
+ }
+ }
+
+ if (cache->ready) {
+ list_splice_entire_tail(cache->ready, caller_list);
+ cache->ready = NULL;
+ }
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+ struct rds_ib_incoming *inc;
+ struct rds_ib_incoming *inc_tmp;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *frag_tmp;
+ LIST_HEAD(list);
+
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+ rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+ free_percpu(ic->i_cache_incs.percpu);
+
+ list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+ list_del(&inc->ii_cache_entry);
+ WARN_ON(!list_empty(&inc->ii_frags));
+ kmem_cache_free(rds_ib_incoming_slab, inc);
+ }
+
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+ rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+ free_percpu(ic->i_cache_frags.percpu);
+
+ list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+ list_del(&frag->f_cache_entry);
+ WARN_ON(!list_empty(&frag->f_item));
+ kmem_cache_free(rds_ib_frag_slab, frag);
+ }
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+ struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+ struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+ rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+ struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+ /* Free attached frags */
+ list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_ib_frag_free(ic, frag);
+ }
+ BUG_ON(!list_empty(&ibinc->ii_frags));
+
+ rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+ rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
}
static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
recv->r_ibinc = NULL;
}
if (recv->r_frag) {
- rds_ib_recv_unmap_page(ic, recv);
- if (recv->r_frag->f_page)
- rds_ib_frag_drop_page(recv->r_frag);
- rds_ib_frag_free(recv->r_frag);
+ ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+ rds_ib_frag_free(ic, recv->r_frag);
recv->r_frag = NULL;
}
}
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
for (i = 0; i < ic->i_recv_ring.w_nr; i++)
rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
- if (ic->i_frag.f_page)
- rds_ib_frag_drop_page(&ic->i_frag);
}
-static int rds_ib_recv_refill_one(struct rds_connection *conn,
- struct rds_ib_recv_work *recv,
- gfp_t kptr_gfp, gfp_t page_gfp)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+ gfp_t slab_mask)
{
- struct rds_ib_connection *ic = conn->c_transport_data;
- dma_addr_t dma_addr;
- struct ib_sge *sge;
- int ret = -ENOMEM;
+ struct rds_ib_incoming *ibinc;
+ struct list_head *cache_item;
+ int avail_allocs;
- if (recv->r_ibinc == NULL) {
- if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
+ cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+ if (cache_item) {
+ ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+ } else {
+ avail_allocs = atomic_add_unless(&rds_ib_allocation,
+ 1, rds_ib_sysctl_max_recv_allocation);
+ if (!avail_allocs) {
rds_ib_stats_inc(s_ib_rx_alloc_limit);
- goto out;
+ return NULL;
}
- recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
- kptr_gfp);
- if (recv->r_ibinc == NULL) {
+ ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+ if (!ibinc) {
atomic_dec(&rds_ib_allocation);
- goto out;
+ return NULL;
}
- INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
- rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
}
+ INIT_LIST_HEAD(&ibinc->ii_frags);
+ rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
- if (recv->r_frag == NULL) {
- recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
- if (recv->r_frag == NULL)
- goto out;
- INIT_LIST_HEAD(&recv->r_frag->f_item);
- recv->r_frag->f_page = NULL;
+ return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+ gfp_t slab_mask, gfp_t page_mask)
+{
+ struct rds_page_frag *frag;
+ struct list_head *cache_item;
+ int ret;
+
+ cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+ if (cache_item) {
+ frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+ } else {
+ frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+ if (!frag)
+ return NULL;
+
+ sg_init_table(&frag->f_sg, 1);
+ ret = rds_page_remainder_alloc(&frag->f_sg,
+ RDS_FRAG_SIZE, page_mask);
+ if (ret) {
+ kmem_cache_free(rds_ib_frag_slab, frag);
+ return NULL;
+ }
}
- if (ic->i_frag.f_page == NULL) {
- ic->i_frag.f_page = alloc_page(page_gfp);
- if (ic->i_frag.f_page == NULL)
- goto out;
- ic->i_frag.f_offset = 0;
+ INIT_LIST_HEAD(&frag->f_item);
+
+ return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv, int prefill)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_sge *sge;
+ int ret = -ENOMEM;
+ gfp_t slab_mask = GFP_NOWAIT;
+ gfp_t page_mask = GFP_NOWAIT;
+
+ if (prefill) {
+ slab_mask = GFP_KERNEL;
+ page_mask = GFP_HIGHUSER;
}
- dma_addr = ib_dma_map_page(ic->i_cm_id->device,
- ic->i_frag.f_page,
- ic->i_frag.f_offset,
- RDS_FRAG_SIZE,
- DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
- goto out;
+ if (!ic->i_cache_incs.ready)
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+ if (!ic->i_cache_frags.ready)
+ rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
/*
- * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
- * must be called on this recv. This happens as completions hit
- * in order or on connection shutdown.
+ * ibinc was taken from recv if recv contained the start of a message.
+ * recvs that were continuations will still have this allocated.
*/
- recv->r_frag->f_page = ic->i_frag.f_page;
- recv->r_frag->f_offset = ic->i_frag.f_offset;
- recv->r_frag->f_mapped = dma_addr;
+ if (!recv->r_ibinc) {
+ recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+ if (!recv->r_ibinc)
+ goto out;
+ }
- sge = rds_ib_data_sge(ic, recv->r_sge);
- sge->addr = dma_addr;
- sge->length = RDS_FRAG_SIZE;
+ WARN_ON(recv->r_frag); /* leak! */
+ recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+ if (!recv->r_frag)
+ goto out;
+
+ ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+ 1, DMA_FROM_DEVICE);
+ WARN_ON(ret != 1);
- sge = rds_ib_header_sge(ic, recv->r_sge);
+ sge = &recv->r_sge[0];
sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
sge->length = sizeof(struct rds_header);
- get_page(recv->r_frag->f_page);
-
- if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
- ic->i_frag.f_offset += RDS_FRAG_SIZE;
- } else {
- put_page(ic->i_frag.f_page);
- ic->i_frag.f_page = NULL;
- ic->i_frag.f_offset = 0;
- }
+ sge = &recv->r_sge[1];
+ sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+ sge->length = sg_dma_len(&recv->r_frag->f_sg);
ret = 0;
out:
@@ -216,13 +350,11 @@ out:
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
- * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
- gfp_t page_gfp, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
if (pos >= ic->i_recv_ring.w_nr) {
printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
pos);
- ret = -EINVAL;
break;
}
recv = &ic->i_recvs[pos];
- ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+ ret = rds_ib_recv_refill_one(conn, recv, prefill);
if (ret) {
- ret = -1;
break;
}
/* XXX when can this fail? */
ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
- recv->r_ibinc, recv->r_frag->f_page,
- (long) recv->r_frag->f_mapped, ret);
+ recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+ (long) sg_dma_address(&recv->r_frag->f_sg), ret);
if (ret) {
rds_ib_conn_error(conn, "recv post on "
"%pI4 returned %d, disconnecting and "
"reconnecting\n", &conn->c_faddr,
ret);
- ret = -1;
break;
}
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
if (ret)
rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
- return ret;
}
-void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * We move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/compxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true with one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+ struct rds_ib_refill_cache *cache)
{
- struct rds_ib_incoming *ibinc;
- struct rds_page_frag *frag;
- struct rds_page_frag *pos;
+ unsigned long flags;
+ struct rds_ib_cache_head *chp;
+ struct list_head *old;
- ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
- rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+ local_irq_save(flags);
- list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
- list_del_init(&frag->f_item);
- rds_ib_frag_drop_page(frag);
- rds_ib_frag_free(frag);
- }
+ chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+ if (!chp->first)
+ INIT_LIST_HEAD(new_item);
+ else /* put on front */
+ list_add_tail(new_item, chp->first);
+ chp->first = new_item;
+ chp->count++;
+
+ if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+ goto end;
+
+ /*
+ * Return our per-cpu first list to the cache's xfer by atomically
+ * grabbing the current xfer list, appending it to our per-cpu list,
+ * and then atomically returning that entire list back to the
+ * cache's xfer list as long as it's still empty.
+ */
+ do {
+ old = xchg(&cache->xfer, NULL);
+ if (old)
+ list_splice_entire_tail(old, chp->first);
+ old = cmpxchg(&cache->xfer, NULL, chp->first);
+ } while (old);
+
+ chp->first = NULL;
+ chp->count = 0;
+end:
+ local_irq_restore(flags);
}
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
{
- struct rds_ib_incoming *ibinc;
-
- ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ struct list_head *head = cache->ready;
+
+ if (head) {
+ if (!list_empty(head)) {
+ cache->ready = head->next;
+ list_del_init(head);
+ } else
+ cache->ready = NULL;
+ }
- rds_ib_inc_purge(inc);
- rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
- BUG_ON(!list_empty(&ibinc->ii_frags));
- kmem_cache_free(rds_ib_incoming_slab, ibinc);
- atomic_dec(&rds_ib_allocation);
- BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+ return head;
}
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
to_copy = min_t(unsigned long, to_copy, len - copied);
rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
- "[%p, %lu] + %lu\n",
+ "[%p, %u] + %lu\n",
to_copy, iov->iov_base, iov->iov_len, iov_off,
- frag->f_page, frag->f_offset, frag_off);
+ sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
/* XXX needs + offset for multiple recvs per page */
- ret = rds_page_copy_to_user(frag->f_page,
- frag->f_offset + frag_off,
+ ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+ frag->f_sg.offset + frag_off,
iov->iov_base + iov_off,
to_copy);
if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
return rds_ib_get_ack(ic);
}
-static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
- struct rds_ib_recv_work *recv,
- u32 data_len)
-{
- struct rds_ib_connection *ic = conn->c_transport_data;
- void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
- void *addr;
- u32 misplaced_hdr_bytes;
-
- /*
- * Support header at the front (RDS 3.1+) as well as header-at-end.
- *
- * Cases:
- * 1) header all in header buff (great!)
- * 2) header all in data page (copy all to header buff)
- * 3) header split across hdr buf + data page
- * (move bit in hdr buff to end before copying other bit from data page)
- */
- if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
- return hdr_buff;
-
- if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
- addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
- memcpy(hdr_buff,
- addr + recv->r_frag->f_offset + data_len,
- sizeof(struct rds_header));
- kunmap_atomic(addr, KM_SOFTIRQ0);
- return hdr_buff;
- }
-
- misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
-
- memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
-
- addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
- memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
- sizeof(struct rds_header) - misplaced_hdr_bytes);
- kunmap_atomic(addr, KM_SOFTIRQ0);
- return hdr_buff;
-}
-
/*
* It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
- addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+ addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
src = addr + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
}
data_len -= sizeof(struct rds_header);
- ihdr = rds_ib_get_header(conn, recv, data_len);
+ ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
* the inc is freed. We don't go that route, so we have to drop the
* page ref ourselves. We can't just leave the page on the recv
* because that confuses the dma mapping of pages and each recv's use
- * of a partial page. We can leave the frag, though, it will be
- * reused.
+ * of a partial page.
*
* FIXME: Fold this into the code path below.
*/
- rds_ib_frag_drop_page(recv->r_frag);
+ rds_ib_frag_free(ic, recv->r_frag);
+ recv->r_frag = NULL;
return;
}
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
- if (ibinc == NULL) {
+ if (!ibinc) {
ibinc = recv->r_ibinc;
recv->r_ibinc = NULL;
ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
struct rds_ib_recv_work *recv;
while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status,
+ rds_ib_wc_status_str(wc.status), wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_rx_cq_event);
recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
- rds_ib_recv_unmap_page(ic, recv);
+ ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
/*
* Also process recvs in connecting state because it is possible
* to get a recv completion _before_ the rdmacm ESTABLISHED
* event is processed.
*/
- if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc.byte_len, state);
+ } else {
/* We expect errors as the qp is drained during shutdown */
- if (wc.status == IB_WC_SUCCESS) {
- rds_ib_process_recv(conn, recv, wc.byte_len, state);
- } else {
- rds_ib_conn_error(conn, "recv completion on "
- "%pI4 had status %u, disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- wc.status);
- }
+ if (rds_conn_up(conn) || rds_conn_connecting(conn))
+ rds_ib_conn_error(conn, "recv completion on %pI4 had "
+ "status %u (%s), disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status,
+ rds_ib_wc_status_str(wc.status));
}
+ /*
+ * It's very important that we only free this ring entry if we've truly
+ * freed the resources allocated to the entry. The refilling path can
+ * leak if we don't.
+ */
rds_ib_ring_free(&ic->i_recv_ring, 1);
}
}
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
if (rds_ib_ring_empty(&ic->i_recv_ring))
rds_ib_stats_inc(s_ib_rx_ring_empty);
- /*
- * If the ring is running low, then schedule the thread to refill.
- */
if (rds_ib_ring_low(&ic->i_recv_ring))
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ rds_ib_recv_refill(conn, 0);
}
int rds_ib_recv(struct rds_connection *conn)
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
int ret = 0;
rdsdebug("conn %p\n", conn);
-
- /*
- * If we get a temporary posting failure in this context then
- * we're really low and we want the caller to back off for a bit.
- */
- mutex_lock(&ic->i_recv_mutex);
- if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
- ret = -ENOMEM;
- else
- rds_ib_stats_inc(s_ib_rx_refill_from_thread);
- mutex_unlock(&ic->i_recv_mutex);
-
if (rds_conn_up(conn))
rds_ib_attempt_ack(ic);
return ret;
}
-int __init rds_ib_recv_init(void)
+int rds_ib_recv_init(void)
{
struct sysinfo si;
int ret = -ENOMEM;
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
sizeof(struct rds_ib_incoming),
- 0, 0, NULL);
- if (rds_ib_incoming_slab == NULL)
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!rds_ib_incoming_slab)
goto out;
rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
sizeof(struct rds_page_frag),
- 0, 0, NULL);
- if (rds_ib_frag_slab == NULL)
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!rds_ib_frag_slab)
kmem_cache_destroy(rds_ib_incoming_slab);
else
ret = 0;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 17fa80803ab..71f373c421b 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,11 +36,49 @@
#include <linux/dmapool.h>
#include "rds.h"
-#include "rdma.h"
#include "ib.h"
-static void rds_ib_send_rdma_complete(struct rds_message *rm,
- int wc_status)
+static char *rds_ib_wc_status_strings[] = {
+#define RDS_IB_WC_STATUS_STR(foo) \
+ [IB_WC_##foo] = __stringify(IB_WC_##foo)
+ RDS_IB_WC_STATUS_STR(SUCCESS),
+ RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
+ RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
+ RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
+ RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
+ RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
+ RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
+ RDS_IB_WC_STATUS_STR(REM_OP_ERR),
+ RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
+ RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
+ RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
+ RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
+ RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
+ RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
+ RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
+ RDS_IB_WC_STATUS_STR(FATAL_ERR),
+ RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
+ RDS_IB_WC_STATUS_STR(GENERAL_ERR),
+#undef RDS_IB_WC_STATUS_STR
+};
+
+char *rds_ib_wc_status_str(enum ib_wc_status status)
+{
+ return rds_str_array(rds_ib_wc_status_strings,
+ ARRAY_SIZE(rds_ib_wc_status_strings), status);
+}
+
+/*
+ * Convert IB-specific error message to RDS error message and call core
+ * completion handler.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+ int wc_status,
+ void (*complete)(struct rds_message *rm, int status))
{
int notify_status;
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
notify_status = RDS_RDMA_OTHER_ERROR;
break;
}
- rds_rdma_send_complete(rm, notify_status);
+ complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+ struct rm_data_op *op,
+ int wc_status)
+{
+ if (op->op_nents)
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents,
+ DMA_TO_DEVICE);
}
static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
- struct rds_rdma_op *op)
+ struct rm_rdma_op *op,
+ int wc_status)
{
- if (op->r_mapped) {
+ if (op->op_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device,
- op->r_sg, op->r_nents,
- op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->r_mapped = 0;
+ op->op_sg, op->op_nents,
+ op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->op_mapped = 0;
}
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * we would need to take an event for the rdma WR. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_ib_send_complete(container_of(op, struct rds_message, rdma),
+ wc_status, rds_rdma_send_complete);
+
+ if (op->op_write)
+ rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
}
-static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
- struct rds_ib_send_work *send,
- int wc_status)
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+ struct rm_atomic_op *op,
+ int wc_status)
{
- struct rds_message *rm = send->s_rm;
-
- rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
-
- ib_dma_unmap_sg(ic->i_cm_id->device,
- rm->m_sg, rm->m_nents,
- DMA_TO_DEVICE);
-
- if (rm->m_rdma_op != NULL) {
- rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
-
- /* If the user asked for a completion notification on this
- * message, we can implement three different semantics:
- * 1. Notify when we received the ACK on the RDS message
- * that was queued with the RDMA. This provides reliable
- * notification of RDMA status at the expense of a one-way
- * packet delay.
- * 2. Notify when the IB stack gives us the completion event for
- * the RDMA operation.
- * 3. Notify when the IB stack gives us the completion event for
- * the accompanying RDS messages.
- * Here, we implement approach #3. To implement approach #2,
- * call rds_rdma_send_complete from the cq_handler. To implement #1,
- * don't call rds_rdma_send_complete at all, and fall back to the notify
- * handling in the ACK processing code.
- *
- * Note: There's no need to explicitly sync any RDMA buffers using
- * ib_dma_sync_sg_for_cpu - the completion for the RDMA
- * operation itself unmapped the RDMA buffers, which takes care
- * of synching.
- */
- rds_ib_send_rdma_complete(rm, wc_status);
+ /* unmap atomic recvbuf */
+ if (op->op_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+ DMA_FROM_DEVICE);
+ op->op_mapped = 0;
+ }
- if (rm->m_rdma_op->r_write)
- rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
- else
- rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+ rds_ib_send_complete(container_of(op, struct rds_message, atomic),
+ wc_status, rds_atomic_send_complete);
+
+ if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
+ rds_ib_stats_inc(s_ib_atomic_cswp);
+ else
+ rds_ib_stats_inc(s_ib_atomic_fadd);
+}
+
+/*
+ * Unmap the resources associated with a struct send_work.
+ *
+ * Returns the rm for no good reason other than it is unobtainable
+ * other than by switching on wr.opcode, currently, and the caller,
+ * the event handler, needs it.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ int wc_status)
+{
+ struct rds_message *rm = NULL;
+
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_op) {
+ rm = container_of(send->s_op, struct rds_message, data);
+ rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+ }
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ if (send->s_op) {
+ rm = container_of(send->s_op, struct rds_message, rdma);
+ rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+ }
+ break;
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ if (send->s_op) {
+ rm = container_of(send->s_op, struct rds_message, atomic);
+ rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+ }
+ break;
+ default:
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
+ "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
}
- /* If anyone waited for this message to get flushed out, wake
- * them up now */
- rds_message_unmapped(rm);
+ send->s_wr.opcode = 0xdead;
- rds_message_put(rm);
- send->s_rm = NULL;
+ return rm;
}
void rds_ib_send_init_ring(struct rds_ib_connection *ic)
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
struct ib_sge *sge;
- send->s_rm = NULL;
send->s_op = NULL;
send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge;
- send->s_wr.num_sge = 1;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.send_flags = 0;
send->s_wr.ex.imm_data = 0;
- sge = rds_ib_data_sge(ic, send->s_sge);
- sge->lkey = ic->i_mr->lkey;
-
- sge = rds_ib_header_sge(ic, send->s_sge);
+ sge = &send->s_sge[0];
sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_mr->lkey;
+
+ send->s_sge[1].lkey = ic->i_mr->lkey;
}
}
@@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
u32 i;
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
- if (send->s_wr.opcode == 0xdead)
- continue;
- if (send->s_rm)
- rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
- if (send->s_op)
- rds_ib_send_unmap_rdma(ic, send->s_op);
+ if (send->s_op && send->s_wr.opcode != 0xdead)
+ rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
}
}
/*
+ * The only fast path caller always has a non-zero nr, so we don't
+ * bother testing nr before performing the atomic sub.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+ if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
+ waitqueue_active(&rds_ib_ring_empty_wait))
+ wake_up(&rds_ib_ring_empty_wait);
+ BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
+}
+
+/*
* The _oldest/_free ring operations here race cleanly with the alloc/unalloc
* operations performed in the send path. As the sender allocs and potentially
* unallocs the next free entry in the ring it doesn't alter which is
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
{
struct rds_connection *conn = context;
struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_message *rm = NULL;
struct ib_wc wc;
struct rds_ib_send_work *send;
u32 completed;
u32 oldest;
u32 i = 0;
int ret;
+ int nr_sig = 0;
rdsdebug("cq %p conn %p\n", cq, conn);
rds_ib_stats_inc(s_ib_tx_cq_call);
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
while (ib_poll_cq(cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status,
+ rds_ib_wc_status_str(wc.status), wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_tx_cq_event);
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest];
+ if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+ nr_sig++;
- /* In the error case, wc.opcode sometimes contains garbage */
- switch (send->s_wr.opcode) {
- case IB_WR_SEND:
- if (send->s_rm)
- rds_ib_send_unmap_rm(ic, send, wc.status);
- break;
- case IB_WR_RDMA_WRITE:
- case IB_WR_RDMA_READ:
- /* Nothing to be done - the SG list will be unmapped
- * when the SEND completes. */
- break;
- default:
- if (printk_ratelimit())
- printk(KERN_NOTICE
- "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
- __func__, send->s_wr.opcode);
- break;
- }
+ rm = rds_ib_send_unmap_op(ic, send, wc.status);
- send->s_wr.opcode = 0xdead;
- send->s_wr.num_sge = 1;
if (send->s_queued + HZ/2 < jiffies)
rds_ib_stats_inc(s_ib_tx_stalled);
- /* If a RDMA operation produced an error, signal this right
- * away. If we don't, the subsequent SEND that goes with this
- * RDMA will be canceled with ERR_WFLUSH, and the application
- * never learn that the RDMA failed. */
- if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
- struct rds_message *rm;
-
- rm = rds_send_get_message(conn, send->s_op);
- if (rm) {
- if (rm->m_rdma_op)
- rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
- rds_ib_send_rdma_complete(rm, wc.status);
- rds_message_put(rm);
+ if (send->s_op) {
+ if (send->s_op == rm->m_final_op) {
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
}
+ rds_message_put(rm);
+ send->s_op = NULL;
}
oldest = (oldest + 1) % ic->i_send_ring.w_nr;
}
rds_ib_ring_free(&ic->i_send_ring, completed);
+ rds_ib_sub_signaled(ic, nr_sig);
+ nr_sig = 0;
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued))
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
/* We expect errors as the qp is drained during shutdown */
if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn,
- "send completion on %pI4 "
- "had status %u, disconnecting and reconnecting\n",
- &conn->c_faddr, wc.status);
+ rds_ib_conn_error(conn, "send completion on %pI4 had status "
+ "%u (%s), disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status,
+ rds_ib_wc_status_str(wc.status));
}
}
}
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
* credits (see rds_ib_send_add_credits below).
*
* The RDS send code is essentially single-threaded; rds_send_xmit
- * grabs c_send_lock to ensure exclusive access to the send ring.
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
* However, the ACK sending code is independent and can race with
* message SENDs.
*
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
-static inline void
-rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
- struct rds_ib_send_work *send, unsigned int pos,
- unsigned long buffer, unsigned int length,
- int send_flags)
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ bool notify)
{
- struct ib_sge *sge;
-
- WARN_ON(pos != send - ic->i_sends);
-
- send->s_wr.send_flags = send_flags;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.num_sge = 2;
- send->s_wr.next = NULL;
- send->s_queued = jiffies;
- send->s_op = NULL;
-
- if (length != 0) {
- sge = rds_ib_data_sge(ic, send->s_sge);
- sge->addr = buffer;
- sge->length = length;
- sge->lkey = ic->i_mr->lkey;
-
- sge = rds_ib_header_sge(ic, send->s_sge);
- } else {
- /* We're sending a packet with no payload. There is only
- * one SGE */
- send->s_wr.num_sge = 1;
- sge = &send->s_sge[0];
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0 || notify) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED;
+ return 1;
}
-
- sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = ic->i_mr->lkey;
+ return 0;
}
/*
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
u32 pos;
u32 i;
u32 work_alloc;
- u32 credit_alloc;
+ u32 credit_alloc = 0;
u32 posted;
u32 adv_credits = 0;
int send_flags = 0;
- int sent;
+ int bytes_sent = 0;
int ret;
int flow_controlled = 0;
+ int nr_sig = 0;
BUG_ON(off % RDS_FRAG_SIZE);
BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
goto out;
}
- credit_alloc = work_alloc;
if (ic->i_flowctl) {
credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
adv_credits += posted;
if (credit_alloc < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
work_alloc = credit_alloc;
- flow_controlled++;
+ flow_controlled = 1;
}
if (work_alloc == 0) {
set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
}
/* map the message the first time we see it */
- if (ic->i_rm == NULL) {
- /*
- printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
- be16_to_cpu(rm->m_inc.i_hdr.h_dport),
- rm->m_inc.i_hdr.h_flags,
- be32_to_cpu(rm->m_inc.i_hdr.h_len));
- */
- if (rm->m_nents) {
- rm->m_count = ib_dma_map_sg(dev,
- rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
- if (rm->m_count == 0) {
+ if (!ic->i_data_op) {
+ if (rm->data.op_nents) {
+ rm->data.op_count = ib_dma_map_sg(dev,
+ rm->data.op_sg,
+ rm->data.op_nents,
+ DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+ if (rm->data.op_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */
goto out;
}
} else {
- rm->m_count = 0;
+ rm->data.op_count = 0;
}
- ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
- ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
rds_message_addref(rm);
- ic->i_rm = rm;
+ ic->i_data_op = &rm->data;
/* Finalize the header */
if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->m_rdma_op) {
+ if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/*
* Update adv_credits since we reset the ACK_REQUIRED bit.
*/
- rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
- adv_credits += posted;
- BUG_ON(adv_credits > 255);
+ if (ic->i_flowctl) {
+ rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ }
}
- send = &ic->i_sends[pos];
- first = send;
- prev = NULL;
- scat = &rm->m_sg[sg];
- sent = 0;
- i = 0;
-
/* Sometimes you want to put a fence between an RDMA
* READ and the following SEND.
* We could either do this all the time
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+ if (rm->rdma.op_active && rm->rdma.op_fence)
send_flags = IB_SEND_FENCE;
- /*
- * We could be copying the header into the unused tail of the page.
- * That would need to be changed in the future when those pages might
- * be mapped userspace pages or page cache pages. So instead we always
- * use a second sge and our long-lived ring of mapped headers. We send
- * the header after the data so that the data payload can be aligned on
- * the receiver.
- */
+ /* Each frag gets a header. Msgs may be 0 bytes */
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &ic->i_data_op->op_sg[sg];
+ i = 0;
+ do {
+ unsigned int len = 0;
- /* handle a 0-len message */
- if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
- rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
- goto add_header;
- }
+ /* Set up the header */
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 1;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
- /* if there's data reference it with a chain of work reqs */
- for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
- unsigned int len;
+ send->s_sge[0].addr = ic->i_send_hdrs_dma
+ + (pos * sizeof(struct rds_header));
+ send->s_sge[0].length = sizeof(struct rds_header);
- send = &ic->i_sends[pos];
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
- len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
- rds_ib_xmit_populate_wr(ic, send, pos,
- ib_sg_dma_address(dev, scat) + off, len,
- send_flags);
+ /* Set up the data, if present */
+ if (i < work_alloc
+ && scat != &rm->data.op_sg[rm->data.op_count]) {
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ send->s_wr.num_sge = 2;
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time
- * on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0) {
- ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- }
+ send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+ send->s_sge[1].length = len;
- ic->i_unsignaled_bytes -= len;
- if (ic->i_unsignaled_bytes <= 0) {
- ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ bytes_sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
}
+ rds_ib_set_wr_signal_state(ic, send, 0);
+
/*
* Always signal the last one if we're stopping due to flow control.
*/
- if (flow_controlled && i == (work_alloc-1))
+ if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+ nr_sig++;
+
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
- sent += len;
- off += len;
- if (off == ib_sg_dma_len(dev, scat)) {
- scat++;
- off = 0;
- }
-
-add_header:
- /* Tack on the header after the data. The header SGE should already
- * have been set up to point to the right header buffer. */
- memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
- if (0) {
- struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
- printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
- be16_to_cpu(hdr->h_dport),
- hdr->h_flags,
- be32_to_cpu(hdr->h_len));
- }
- if (adv_credits) {
+ if (ic->i_flowctl && adv_credits) {
struct rds_header *hdr = &ic->i_send_hdrs[pos];
/* add credit and redo the header checksum */
@@ -689,20 +721,25 @@ add_header:
prev = send;
pos = (pos + 1) % ic->i_send_ring.w_nr;
- }
+ send = &ic->i_sends[pos];
+ i++;
+
+ } while (i < work_alloc
+ && scat != &rm->data.op_sg[rm->data.op_count]);
/* Account the RDS header in the number of bytes we sent, but just once.
* The caller has no concept of fragmentation. */
if (hdr_off == 0)
- sent += sizeof(struct rds_header);
+ bytes_sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */
- if (scat == &rm->m_sg[rm->m_count]) {
- prev->s_rm = ic->i_rm;
- prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- ic->i_rm = NULL;
+ if (scat == &rm->data.op_sg[rm->data.op_count]) {
+ prev->s_op = ic->i_data_op;
+ prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+ ic->i_data_op = NULL;
}
+ /* Put back wrs & credits we didn't use */
if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
@@ -710,6 +747,9 @@ add_header:
if (ic->i_flowctl && i < credit_alloc)
rds_ib_send_add_credits(conn, credit_alloc - i);
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
/* XXX need to worry about failed_wr and partial sends. */
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
@@ -720,32 +760,127 @@ add_header:
printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- if (prev->s_rm) {
- ic->i_rm = prev->s_rm;
- prev->s_rm = NULL;
+ rds_ib_sub_signaled(ic, nr_sig);
+ if (prev->s_op) {
+ ic->i_data_op = prev->s_op;
+ prev->s_op = NULL;
}
rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
goto out;
}
- ret = sent;
+ ret = bytes_sent;
out:
BUG_ON(adv_credits);
return ret;
}
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case, we always map 1 SG, and
+ * only 8 bytes, for the return value from the atomic operation.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_send_work *send = NULL;
+ struct ib_send_wr *failed_wr;
+ struct rds_ib_device *rds_ibdev;
+ u32 pos;
+ u32 work_alloc;
+ int ret;
+ int nr_sig = 0;
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+ if (work_alloc != 1) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* address of send request in ring */
+ send = &ic->i_sends[pos];
+ send->s_queued = jiffies;
+
+ if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+ send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+ send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+ send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+ send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+ send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+ } else { /* FADD */
+ send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+ send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
+ send->s_wr.wr.atomic.swap = 0;
+ send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+ send->s_wr.wr.atomic.swap_mask = 0;
+ }
+ nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+ send->s_wr.num_sge = 1;
+ send->s_wr.next = NULL;
+ send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+ send->s_wr.wr.atomic.rkey = op->op_rkey;
+ send->s_op = op;
+ rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+ /* map 8 byte retval buffer to the device */
+ ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+ if (ret != 1) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ /* Convert our struct scatterlist to struct ib_sge */
+ send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+ send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+ send->s_sge[0].lkey = ic->i_mr->lkey;
+
+ rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+ send->s_sge[0].addr, send->s_sge[0].length);
+
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
+ failed_wr = &send->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+ rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+ send, &send->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &send->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
+ goto out;
+ }
+
+ if (unlikely(failed_wr != &send->s_wr)) {
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+ BUG_ON(failed_wr != &send->s_wr);
+ }
+
+out:
+ return ret;
+}
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_send_work *send = NULL;
struct rds_ib_send_work *first;
struct rds_ib_send_work *prev;
struct ib_send_wr *failed_wr;
- struct rds_ib_device *rds_ibdev;
struct scatterlist *scat;
unsigned long len;
- u64 remote_addr = op->r_remote_addr;
+ u64 remote_addr = op->op_remote_addr;
+ u32 max_sge = ic->rds_ibdev->max_sge;
u32 pos;
u32 work_alloc;
u32 i;
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
int sent;
int ret;
int num_sge;
-
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
- /* map the message the first time we see it */
- if (!op->r_mapped) {
- op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->r_sg, op->r_nents, (op->r_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
- if (op->r_count == 0) {
+ int nr_sig = 0;
+
+ /* map the op the first time we see it */
+ if (!op->op_mapped) {
+ op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents, (op->op_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+ if (op->op_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
- op->r_mapped = 1;
+ op->op_mapped = 1;
}
/*
* Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message.
*/
- i = ceil(op->r_count, rds_ibdev->max_sge);
+ i = ceil(op->op_count, max_sge);
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) {
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &op->r_sg[0];
+ scat = &op->op_sg[0];
sent = 0;
- num_sge = op->r_count;
+ num_sge = op->op_count;
- for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+ for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
send->s_wr.send_flags = 0;
send->s_queued = jiffies;
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0) {
- ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
- send->s_wr.send_flags = IB_SEND_SIGNALED;
- }
+ send->s_op = NULL;
+
+ nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
- send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->r_key;
- send->s_op = op;
+ send->s_wr.wr.rdma.rkey = op->op_rkey;
- if (num_sge > rds_ibdev->max_sge) {
- send->s_wr.num_sge = rds_ibdev->max_sge;
- num_sge -= rds_ibdev->max_sge;
+ if (num_sge > max_sge) {
+ send->s_wr.num_sge = max_sge;
+ num_sge -= max_sge;
} else {
send->s_wr.num_sge = num_sge;
}
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
if (prev)
prev->s_wr.next = &send->s_wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
send->s_sge[j].addr =
ib_sg_dma_address(ic->i_cm_id->device, scat);
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
send = ic->i_sends;
}
- /* if we finished the message then send completion owns it */
- if (scat == &op->r_sg[op->r_count])
- prev->s_wr.send_flags = IB_SEND_SIGNALED;
+ /* give a reference to the last op */
+ if (scat == &op->op_sg[op->op_count]) {
+ prev->s_op = op;
+ rds_message_addref(container_of(op, struct rds_message, rdma));
+ }
if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
}
+ if (nr_sig)
+ atomic_add(nr_sig, &ic->i_signaled_sends);
+
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_sub_signaled(ic, nr_sig);
goto out;
}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d2c904dd6fb..2d5965d6e97 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
"ib_rdma_mr_pool_flush",
"ib_rdma_mr_pool_wait",
"ib_rdma_mr_pool_depleted",
+ "ib_atomic_cswp",
+ "ib_atomic_fadd",
};
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index 03f01cb4e0f..1253b006efd 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
-unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-
/*
* This sysctl does nothing.
*
@@ -65,7 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
*/
unsigned int rds_ib_sysctl_flow_control = 0;
-ctl_table rds_ib_sysctl_table[] = {
+static ctl_table rds_ib_sysctl_table[] = {
{
.procname = "max_send_wr",
.data = &rds_ib_sysctl_max_send_wr,
@@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = {
.extra2 = &rds_ib_sysctl_max_unsig_wr_max,
},
{
- .procname = "max_unsignaled_bytes",
- .data = &rds_ib_sysctl_max_unsig_bytes,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
- .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
- },
- {
.procname = "max_recv_allocation",
.data = &rds_ib_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void)
unregister_sysctl_table(rds_ib_sysctl_hdr);
}
-int __init rds_ib_sysctl_init(void)
+int rds_ib_sysctl_init(void)
{
rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
- if (rds_ib_sysctl_hdr == NULL)
+ if (!rds_ib_sysctl_hdr)
return -ENOMEM;
return 0;
}
diff --git a/net/rds/info.c b/net/rds/info.c
index c45c4173a44..4fdf1b6e84f 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
spin_lock(&rds_info_lock);
- BUG_ON(rds_info_funcs[offset] != NULL);
+ BUG_ON(rds_info_funcs[offset]);
rds_info_funcs[offset] = func;
spin_unlock(&rds_info_lock);
}
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
*/
void rds_info_iter_unmap(struct rds_info_iterator *iter)
{
- if (iter->addr != NULL) {
+ if (iter->addr) {
kunmap_atomic(iter->addr, KM_USER0);
iter->addr = NULL;
}
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
unsigned long this;
while (bytes) {
- if (iter->addr == NULL)
+ if (!iter->addr)
iter->addr = kmap_atomic(*iter->pages, KM_USER0);
this = min(bytes, PAGE_SIZE - iter->offset);
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
>> PAGE_SHIFT;
pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
- if (pages == NULL) {
+ if (!pages) {
ret = -ENOMEM;
goto out;
}
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
call_func:
func = rds_info_funcs[optname - RDS_INFO_FIRST];
- if (func == NULL) {
+ if (!func) {
ret = -ENOPROTOOPT;
goto out;
}
@@ -234,7 +234,7 @@ call_func:
ret = -EFAULT;
out:
- for (i = 0; pages != NULL && i < nr_pages; i++)
+ for (i = 0; pages && i < nr_pages; i++)
put_page(pages[i]);
kfree(pages);
diff --git a/net/rds/iw.c b/net/rds/iw.c
index c8f3d3525cb..5a9676fe594 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -56,7 +56,7 @@ struct list_head rds_iw_devices;
DEFINE_SPINLOCK(iw_nodev_conns_lock);
LIST_HEAD(iw_nodev_conns);
-void rds_iw_add_one(struct ib_device *device)
+static void rds_iw_add_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct ib_device_attr *dev_attr;
@@ -124,7 +124,7 @@ free_attr:
kfree(dev_attr);
}
-void rds_iw_remove_one(struct ib_device *device)
+static void rds_iw_remove_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct rds_iw_cm_id *i_cm_id, *next;
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
.laddr_check = rds_iw_laddr_check,
.xmit_complete = rds_iw_xmit_complete,
.xmit = rds_iw_xmit,
- .xmit_cong_map = NULL,
.xmit_rdma = rds_iw_xmit_rdma,
.recv = rds_iw_recv,
.conn_alloc = rds_iw_conn_alloc,
@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = {
.conn_connect = rds_iw_conn_connect,
.conn_shutdown = rds_iw_conn_shutdown,
.inc_copy_to_user = rds_iw_inc_copy_to_user,
- .inc_purge = rds_iw_inc_purge,
.inc_free = rds_iw_inc_free,
.cm_initiate_connect = rds_iw_cm_initiate_connect,
.cm_handle_connect = rds_iw_cm_handle_connect,
@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = {
.t_prefer_loopback = 1,
};
-int __init rds_iw_init(void)
+int rds_iw_init(void)
{
int ret;
diff --git a/net/rds/iw.h b/net/rds/iw.h
index eef2f0c2847..90151922178 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -70,7 +70,7 @@ struct rds_iw_send_work {
struct rds_message *s_rm;
/* We should really put these into a union: */
- struct rds_rdma_op *s_op;
+ struct rm_rdma_op *s_op;
struct rds_iw_mapping *s_mapping;
struct ib_mr *s_mr;
struct ib_fast_reg_page_list *s_page_list;
@@ -268,8 +268,6 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
/* ib.c */
extern struct rds_transport rds_iw_transport;
-extern void rds_iw_add_one(struct ib_device *device);
-extern void rds_iw_remove_one(struct ib_device *device);
extern struct ib_client rds_iw_client;
extern unsigned int fastreg_pool_size;
@@ -284,7 +282,7 @@ void rds_iw_conn_free(void *arg);
int rds_iw_conn_connect(struct rds_connection *conn);
void rds_iw_conn_shutdown(struct rds_connection *conn);
void rds_iw_state_change(struct sock *sk);
-int __init rds_iw_listen_init(void);
+int rds_iw_listen_init(void);
void rds_iw_listen_stop(void);
void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -318,15 +316,13 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
void rds_iw_sync_mr(void *trans_private, int dir);
void rds_iw_free_mr(void *trans_private, int invalidate);
void rds_iw_flush_mrs(void);
-void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
/* ib_recv.c */
-int __init rds_iw_recv_init(void);
+int rds_iw_recv_init(void);
void rds_iw_recv_exit(void);
int rds_iw_recv(struct rds_connection *conn);
int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
gfp_t page_gfp, int prefill);
-void rds_iw_inc_purge(struct rds_incoming *inc);
void rds_iw_inc_free(struct rds_incoming *inc);
int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size);
@@ -358,7 +354,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_iw_send_init_ring(struct rds_iw_connection *ic);
void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
@@ -371,7 +367,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
/* ib_sysctl.c */
-int __init rds_iw_sysctl_init(void);
+int rds_iw_sysctl_init(void);
void rds_iw_sysctl_exit(void);
extern unsigned long rds_iw_sysctl_max_send_wr;
extern unsigned long rds_iw_sysctl_max_recv_wr;
@@ -379,7 +375,6 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs;
extern unsigned long rds_iw_sysctl_max_unsig_bytes;
extern unsigned long rds_iw_sysctl_max_recv_allocation;
extern unsigned int rds_iw_sysctl_flow_control;
-extern ctl_table rds_iw_sysctl_table[];
/*
* Helper functions for getting/setting the header and data SGEs in
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index b5dd6ac39be..712cf2d1f28 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
* the rds_iwdev at all.
*/
rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
- if (rds_iwdev == NULL) {
+ if (!rds_iwdev) {
if (printk_ratelimit())
printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
dev->name);
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
- if (ic->i_send_hdrs == NULL) {
+ if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (ic->i_recv_hdrs == NULL) {
+ if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
- if (ic->i_ack == NULL) {
+ if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
- if (ic->i_sends == NULL) {
+ if (!ic->i_sends) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
rds_iw_send_init_ring(ic);
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
- if (ic->i_recvs == NULL) {
+ if (!ic->i_recvs) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
/* XXX too lazy? */
ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
- if (ic == NULL)
+ if (!ic)
return -ENOMEM;
INIT_LIST_HEAD(&ic->iw_node);
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 13dc1862d86..59509e9a9e7 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -34,7 +34,6 @@
#include <linux/slab.h>
#include "rds.h"
-#include "rdma.h"
#include "iw.h"
@@ -158,7 +157,8 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *
return 0;
}
-void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
+ struct rdma_cm_id *cm_id)
{
struct rds_iw_cm_id *i_cm_id;
@@ -207,9 +207,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con
BUG_ON(list_empty(&ic->iw_node));
list_del(&ic->iw_node);
- spin_lock_irq(&rds_iwdev->spinlock);
+ spin_lock(&rds_iwdev->spinlock);
list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
- spin_unlock_irq(&rds_iwdev->spinlock);
+ spin_unlock(&rds_iwdev->spinlock);
spin_unlock_irq(&iw_nodev_conns_lock);
ic->rds_iwdev = rds_iwdev;
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 3d479067d54..5e57347f49f 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
static void rds_iw_frag_free(struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, frag->f_page);
- BUG_ON(frag->f_page != NULL);
+ BUG_ON(frag->f_page);
kmem_cache_free(rds_iw_frag_slab, frag);
}
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
struct ib_sge *sge;
int ret = -ENOMEM;
- if (recv->r_iwinc == NULL) {
+ if (!recv->r_iwinc) {
if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
rds_iw_stats_inc(s_iw_rx_alloc_limit);
goto out;
}
recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
kptr_gfp);
- if (recv->r_iwinc == NULL) {
+ if (!recv->r_iwinc) {
atomic_dec(&rds_iw_allocation);
goto out;
}
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
}
- if (recv->r_frag == NULL) {
+ if (!recv->r_frag) {
recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
- if (recv->r_frag == NULL)
+ if (!recv->r_frag)
goto out;
INIT_LIST_HEAD(&recv->r_frag->f_item);
recv->r_frag->f_page = NULL;
}
- if (ic->i_frag.f_page == NULL) {
+ if (!ic->i_frag.f_page) {
ic->i_frag.f_page = alloc_page(page_gfp);
- if (ic->i_frag.f_page == NULL)
+ if (!ic->i_frag.f_page)
goto out;
ic->i_frag.f_offset = 0;
}
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
return ret;
}
-void rds_iw_inc_purge(struct rds_incoming *inc)
+static void rds_iw_inc_purge(struct rds_incoming *inc)
{
struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag;
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
- if (iwinc == NULL) {
+ if (!iwinc) {
iwinc = recv->r_iwinc;
recv->r_iwinc = NULL;
ic->i_iwinc = iwinc;
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
return ret;
}
-int __init rds_iw_recv_init(void)
+int rds_iw_recv_init(void)
{
struct sysinfo si;
int ret = -ENOMEM;
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
sizeof(struct rds_iw_incoming),
0, 0, NULL);
- if (rds_iw_incoming_slab == NULL)
+ if (!rds_iw_incoming_slab)
goto out;
rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
sizeof(struct rds_page_frag),
0, 0, NULL);
- if (rds_iw_frag_slab == NULL)
+ if (!rds_iw_frag_slab)
kmem_cache_destroy(rds_iw_incoming_slab);
else
ret = 0;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 52182ff7519..6280ea020d4 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -36,7 +36,6 @@
#include <linux/dmapool.h>
#include "rds.h"
-#include "rdma.h"
#include "iw.h"
static void rds_iw_send_rdma_complete(struct rds_message *rm,
@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
}
static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
- struct rds_rdma_op *op)
+ struct rm_rdma_op *op)
{
- if (op->r_mapped) {
+ if (op->op_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device,
- op->r_sg, op->r_nents,
- op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->r_mapped = 0;
+ op->op_sg, op->op_nents,
+ op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->op_mapped = 0;
}
}
@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
ib_dma_unmap_sg(ic->i_cm_id->device,
- rm->m_sg, rm->m_nents,
+ rm->data.op_sg, rm->data.op_nents,
DMA_TO_DEVICE);
- if (rm->m_rdma_op != NULL) {
- rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
+ if (rm->rdma.op_active) {
+ rds_iw_send_unmap_rdma(ic, &rm->rdma);
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
*/
rds_iw_send_rdma_complete(rm, wc_status);
- if (rm->m_rdma_op->r_write)
- rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+ if (rm->rdma.op_write)
+ rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
else
- rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+ rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
}
/* If anyone waited for this message to get flushed out, wake
@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
}
/* map the message the first time we see it */
- if (ic->i_rm == NULL) {
+ if (!ic->i_rm) {
/*
printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
be16_to_cpu(rm->m_inc.i_hdr.h_dport),
rm->m_inc.i_hdr.h_flags,
be32_to_cpu(rm->m_inc.i_hdr.h_len));
*/
- if (rm->m_nents) {
- rm->m_count = ib_dma_map_sg(dev,
- rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
- if (rm->m_count == 0) {
+ if (rm->data.op_nents) {
+ rm->data.op_count = ib_dma_map_sg(dev,
+ rm->data.op_sg,
+ rm->data.op_nents,
+ DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+ if (rm->data.op_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */
goto out;
}
} else {
- rm->m_count = 0;
+ rm->data.op_count = 0;
}
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->m_rdma_op) {
+ if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &rm->m_sg[sg];
+ scat = &rm->data.op_sg[sg];
sent = 0;
i = 0;
@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+ if (rm->rdma.op_active && rm->rdma.op_fence)
send_flags = IB_SEND_FENCE;
/*
@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
}
/* if there's data reference it with a chain of work reqs */
- for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
unsigned int len;
send = &ic->i_sends[pos];
@@ -728,7 +729,7 @@ add_header:
sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */
- if (scat == &rm->m_sg[rm->m_count]) {
+ if (scat == &rm->data.op_sg[rm->data.op_count]) {
prev->s_rm = ic->i_rm;
prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
ic->i_rm = NULL;
@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
}
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_send_work *send = NULL;
@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
struct rds_iw_device *rds_iwdev;
struct scatterlist *scat;
unsigned long len;
- u64 remote_addr = op->r_remote_addr;
+ u64 remote_addr = op->op_remote_addr;
u32 pos, fr_pos;
u32 work_alloc;
u32 i;
@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
/* map the message the first time we see it */
- if (!op->r_mapped) {
- op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->r_sg, op->r_nents, (op->r_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
- if (op->r_count == 0) {
+ if (!op->op_mapped) {
+ op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->op_sg, op->op_nents, (op->op_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+ if (op->op_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
- op->r_mapped = 1;
+ op->op_mapped = 1;
}
- if (!op->r_write) {
+ if (!op->op_write) {
/* Alloc space on the send queue for the fastreg */
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
if (work_alloc != 1) {
@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
* Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message.
*/
- i = ceil(op->r_count, rds_iwdev->max_sge);
+ i = ceil(op->op_count, rds_iwdev->max_sge);
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) {
@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
}
send = &ic->i_sends[pos];
- if (!op->r_write) {
+ if (!op->op_write) {
first = prev = &ic->i_sends[fr_pos];
} else {
first = send;
prev = NULL;
}
- scat = &op->r_sg[0];
+ scat = &op->op_sg[0];
sent = 0;
- num_sge = op->r_count;
+ num_sge = op->op_count;
- for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+ for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
send->s_wr.send_flags = 0;
send->s_queued = jiffies;
@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
* for local access after RDS is finished with it, using
* IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
*/
- if (op->r_write)
+ if (op->op_write)
send->s_wr.opcode = IB_WR_RDMA_WRITE;
else
send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->r_key;
+ send->s_wr.wr.rdma.rkey = op->op_rkey;
send->s_op = op;
if (num_sge > rds_iwdev->max_sge) {
@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
if (prev)
prev->s_wr.next = &send->s_wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
}
/* if we finished the message then send completion owns it */
- if (scat == &op->r_sg[op->r_count])
+ if (scat == &op->op_sg[op->op_count])
first->s_wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) {
@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
* adapters do not allow using the lkey for this at all. To bypass this use a
* fastreg_mr (or possibly a dma_mr)
*/
- if (!op->r_write) {
+ if (!op->op_write) {
rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
- op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+ op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
work_alloc++;
}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
index 1c4428a61a0..e2e47176e72 100644
--- a/net/rds/iw_sysctl.c
+++ b/net/rds/iw_sysctl.c
@@ -55,7 +55,7 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
unsigned int rds_iw_sysctl_flow_control = 1;
-ctl_table rds_iw_sysctl_table[] = {
+static ctl_table rds_iw_sysctl_table[] = {
{
.procname = "max_send_wr",
.data = &rds_iw_sysctl_max_send_wr,
@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void)
unregister_sysctl_table(rds_iw_sysctl_hdr);
}
-int __init rds_iw_sysctl_init(void)
+int rds_iw_sysctl_init(void)
{
rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
- if (rds_iw_sysctl_hdr == NULL)
+ if (!rds_iw_sysctl_hdr)
return -ENOMEM;
return 0;
}
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dd987937945..c390156b426 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg,
unsigned int off)
{
+ /* Do not send cong updates to loopback */
+ if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+ return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+ }
+
BUG_ON(hdr_off || sg || off);
rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
- rds_message_addref(rm); /* for the inc */
+ /* For the embedded inc. Matching put is in loop_inc_free() */
+ rds_message_addref(rm);
rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
GFP_KERNEL, KM_USER0);
@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
}
-static int rds_loop_xmit_cong_map(struct rds_connection *conn,
- struct rds_cong_map *map,
- unsigned long offset)
+/*
+ * See rds_loop_xmit(). Since our inc is embedded in the rm, we
+ * make sure the rm lives at least until the inc is done.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
{
- BUG_ON(offset);
- BUG_ON(map != conn->c_lcong);
-
- rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
-
- return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_put(rm);
}
/* we need to at least give the thread something to succeed */
@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
unsigned long flags;
lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
- if (lc == NULL)
+ if (!lc)
return -ENOMEM;
INIT_LIST_HEAD(&lc->loop_node);
@@ -169,14 +174,12 @@ void rds_loop_exit(void)
*/
struct rds_transport rds_loop_transport = {
.xmit = rds_loop_xmit,
- .xmit_cong_map = rds_loop_xmit_cong_map,
.recv = rds_loop_recv,
.conn_alloc = rds_loop_conn_alloc,
.conn_free = rds_loop_conn_free,
.conn_connect = rds_loop_conn_connect,
.conn_shutdown = rds_loop_conn_shutdown,
.inc_copy_to_user = rds_message_inc_copy_to_user,
- .inc_purge = rds_message_inc_purge,
- .inc_free = rds_message_inc_free,
+ .inc_free = rds_loop_inc_free,
.t_name = "loopback",
};
diff --git a/net/rds/message.c b/net/rds/message.c
index 9a1d67e001b..a84545dae37 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -34,9 +34,6 @@
#include <linux/slab.h>
#include "rds.h"
-#include "rdma.h"
-
-static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_NONE] = 0,
@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm)
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
- for (i = 0; i < rm->m_nents; i++) {
- rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
+ for (i = 0; i < rm->data.op_nents; i++) {
+ rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
/* XXX will have to put_page for page refs */
- __free_page(sg_page(&rm->m_sg[i]));
+ __free_page(sg_page(&rm->data.op_sg[i]));
}
- rm->m_nents = 0;
+ rm->data.op_nents = 0;
- if (rm->m_rdma_op)
- rds_rdma_free_op(rm->m_rdma_op);
- if (rm->m_rdma_mr)
- rds_mr_put(rm->m_rdma_mr);
-}
+ if (rm->rdma.op_active)
+ rds_rdma_free_op(&rm->rdma);
+ if (rm->rdma.op_rdma_mr)
+ rds_mr_put(rm->rdma.op_rdma_mr);
-void rds_message_inc_purge(struct rds_incoming *inc)
-{
- struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
- rds_message_purge(rm);
+ if (rm->atomic.op_active)
+ rds_atomic_free_op(&rm->atomic);
+ if (rm->atomic.op_rdma_mr)
+ rds_mr_put(rm->atomic.op_rdma_mr);
}
void rds_message_put(struct rds_message *rm)
{
rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
-
+ if (atomic_read(&rm->m_refcount) == 0) {
+printk(KERN_CRIT "danger refcount zero on %p\n", rm);
+WARN_ON(1);
+ }
if (atomic_dec_and_test(&rm->m_refcount)) {
BUG_ON(!list_empty(&rm->m_sock_item));
BUG_ON(!list_empty(&rm->m_conn_item));
@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm)
}
EXPORT_SYMBOL_GPL(rds_message_put);
-void rds_message_inc_free(struct rds_incoming *inc)
-{
- struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
- rds_message_put(rm);
-}
-
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq)
{
@@ -113,8 +106,8 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);
-int rds_message_add_extension(struct rds_header *hdr,
- unsigned int type, const void *data, unsigned int len)
+int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
+ const void *data, unsigned int len)
{
unsigned int ext_len = sizeof(u8) + len;
unsigned char *dst;
@@ -184,26 +177,6 @@ none:
return RDS_EXTHDR_NONE;
}
-int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
-{
- struct rds_ext_header_version ext_hdr;
-
- ext_hdr.h_version = cpu_to_be32(version);
- return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
-}
-
-int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
-{
- struct rds_ext_header_version ext_hdr;
- unsigned int pos = 0, len = sizeof(ext_hdr);
-
- /* We assume the version extension is the only one present */
- if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
- return 0;
- *version = be32_to_cpu(ext_hdr.h_version);
- return 1;
-}
-
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
{
struct rds_ext_header_rdma_dest ext_hdr;
@@ -214,41 +187,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
-struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize memory allocation count. Then, each rds op
+ * can grab SGs when initializing its part of the rds_message.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
{
struct rds_message *rm;
- rm = kzalloc(sizeof(struct rds_message) +
- (nents * sizeof(struct scatterlist)), gfp);
+ rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
if (!rm)
goto out;
- if (nents)
- sg_init_table(rm->m_sg, nents);
+ rm->m_used_sgs = 0;
+ rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
atomic_set(&rm->m_refcount, 1);
INIT_LIST_HEAD(&rm->m_sock_item);
INIT_LIST_HEAD(&rm->m_conn_item);
spin_lock_init(&rm->m_rs_lock);
+ init_waitqueue_head(&rm->m_flush_wait);
out:
return rm;
}
+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+ struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
+ struct scatterlist *sg_ret;
+
+ WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+ WARN_ON(!nents);
+
+ sg_ret = &sg_first[rm->m_used_sgs];
+ sg_init_table(sg_ret, nents);
+ rm->m_used_sgs += nents;
+
+ return sg_ret;
+}
+
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
{
struct rds_message *rm;
unsigned int i;
+ int num_sgs = ceil(total_len, PAGE_SIZE);
+ int extra_bytes = num_sgs * sizeof(struct scatterlist);
- rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
- if (rm == NULL)
+ rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+ if (!rm)
return ERR_PTR(-ENOMEM);
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
- rm->m_nents = ceil(total_len, PAGE_SIZE);
+ rm->data.op_nents = ceil(total_len, PAGE_SIZE);
+ rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
- for (i = 0; i < rm->m_nents; ++i) {
- sg_set_page(&rm->m_sg[i],
+ for (i = 0; i < rm->data.op_nents; ++i) {
+ sg_set_page(&rm->data.op_sg[i],
virt_to_page(page_addrs[i]),
PAGE_SIZE, 0);
}
@@ -256,40 +256,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
return rm;
}
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
size_t total_len)
{
unsigned long to_copy;
unsigned long iov_off;
unsigned long sg_off;
- struct rds_message *rm;
struct iovec *iov;
struct scatterlist *sg;
- int ret;
-
- rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
- if (rm == NULL) {
- ret = -ENOMEM;
- goto out;
- }
+ int ret = 0;
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
/*
* now allocate and copy in the data payload.
*/
- sg = rm->m_sg;
+ sg = rm->data.op_sg;
iov = first_iov;
iov_off = 0;
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
while (total_len) {
- if (sg_page(sg) == NULL) {
+ if (!sg_page(sg)) {
ret = rds_page_remainder_alloc(sg, total_len,
GFP_HIGHUSER);
if (ret)
goto out;
- rm->m_nents++;
+ rm->data.op_nents++;
sg_off = 0;
}
@@ -320,14 +313,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
sg++;
}
- ret = 0;
out:
- if (ret) {
- if (rm)
- rds_message_put(rm);
- rm = ERR_PTR(ret);
- }
- return rm;
+ return ret;
}
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
@@ -348,7 +335,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
iov = first_iov;
iov_off = 0;
- sg = rm->m_sg;
+ sg = rm->data.op_sg;
vec_off = 0;
copied = 0;
@@ -394,15 +381,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
*/
void rds_message_wait(struct rds_message *rm)
{
- wait_event(rds_message_flush_waitq,
+ wait_event_interruptible(rm->m_flush_wait,
!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
}
void rds_message_unmapped(struct rds_message *rm)
{
clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
- if (waitqueue_active(&rds_message_flush_waitq))
- wake_up(&rds_message_flush_waitq);
+ wake_up_interruptible(&rm->m_flush_wait);
}
EXPORT_SYMBOL_GPL(rds_message_unmapped);
diff --git a/net/rds/page.c b/net/rds/page.c
index 595a952d4b1..d8acdebe3c7 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -40,7 +40,8 @@ struct rds_page_remainder {
unsigned long r_offset;
};
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
+ rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
@@ -57,30 +58,17 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
unsigned long ret;
void *addr;
- if (to_user)
+ addr = kmap(page);
+ if (to_user) {
rds_stats_add(s_copy_to_user, bytes);
- else
+ ret = copy_to_user(ptr, addr + offset, bytes);
+ } else {
rds_stats_add(s_copy_from_user, bytes);
-
- addr = kmap_atomic(page, KM_USER0);
- if (to_user)
- ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
- else
- ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
- kunmap_atomic(addr, KM_USER0);
-
- if (ret) {
- addr = kmap(page);
- if (to_user)
- ret = copy_to_user(ptr, addr + offset, bytes);
- else
- ret = copy_from_user(addr + offset, ptr, bytes);
- kunmap(page);
- if (ret)
- return -EFAULT;
+ ret = copy_from_user(addr + offset, ptr, bytes);
}
+ kunmap(page);
- return 0;
+ return ret ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(rds_page_copy_user);
@@ -116,7 +104,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
/* jump straight to allocation if we're trying for a huge page */
if (bytes >= PAGE_SIZE) {
page = alloc_page(gfp);
- if (page == NULL) {
+ if (!page) {
ret = -ENOMEM;
} else {
sg_set_page(scat, page, PAGE_SIZE, 0);
@@ -162,7 +150,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
rem = &per_cpu(rds_page_remainders, get_cpu());
local_irq_save(flags);
- if (page == NULL) {
+ if (!page) {
ret = -ENOMEM;
break;
}
@@ -186,6 +174,7 @@ out:
ret ? 0 : scat->length);
return ret;
}
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
static int rds_page_remainder_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 75fd13bb631..1a41debca1c 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -35,7 +35,7 @@
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
-#include "rdma.h"
+#include "rds.h"
/*
* XXX
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
{
struct rds_mr *mr;
struct rb_node *node;
+ unsigned long flags;
/* Release any MRs associated with this socket */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) {
mr = container_of(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0;
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+ rds_destroy_mr(mr);
rds_mr_put(mr);
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
}
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (rs->rs_transport && rs->rs_transport->flush_mrs)
rs->rs_transport->flush_mrs();
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
}
- if (rs->rs_transport->get_mr == NULL) {
+ if (!rs->rs_transport->get_mr) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
/* XXX clamp nr_pages to limit the size of this alloc? */
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (pages == NULL) {
+ if (!pages) {
ret = -ENOMEM;
goto out;
}
mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
- if (mr == NULL) {
+ if (!mr) {
ret = -ENOMEM;
goto out;
}
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
* the zero page.
*/
- ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
if (ret < 0)
goto out;
nents = ret;
sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
- if (sg == NULL) {
+ if (!sg) {
ret = -ENOMEM;
goto out;
}
@@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
- if (mr && (mr->r_use_once || force)) {
+ if (!mr) {
+ printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+ return;
+ }
+
+ if (mr->r_use_once || force) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
zot_me = 1;
- } else if (mr)
- atomic_inc(&mr->r_refcount);
+ }
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
/* May have to issue a dma_sync on this memory region.
* Note we could avoid this if the operation was a RDMA READ,
* but at this point we can't tell. */
- if (mr != NULL) {
- if (mr->r_trans->sync_mr)
- mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
- /* If the MR was marked as invalidate, this will
- * trigger an async flush. */
- if (zot_me)
- rds_destroy_mr(mr);
- rds_mr_put(mr);
- }
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+ /* If the MR was marked as invalidate, this will
+ * trigger an async flush. */
+ if (zot_me)
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
}
-void rds_rdma_free_op(struct rds_rdma_op *ro)
+void rds_rdma_free_op(struct rm_rdma_op *ro)
{
unsigned int i;
- for (i = 0; i < ro->r_nents; i++) {
- struct page *page = sg_page(&ro->r_sg[i]);
+ for (i = 0; i < ro->op_nents; i++) {
+ struct page *page = sg_page(&ro->op_sg[i]);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
- if (!ro->r_write) {
- BUG_ON(in_interrupt());
+ if (!ro->op_write) {
+ BUG_ON(irqs_disabled());
set_page_dirty(page);
}
put_page(page);
}
- kfree(ro->r_notifier);
- kfree(ro);
+ kfree(ro->op_notifier);
+ ro->op_notifier = NULL;
+ ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+ struct page *page = sg_page(ao->op_sg);
+
+ /* Mark page dirty if it was possibly modified, which
+ * is the case for a RDMA_READ which copies from remote
+ * to local memory */
+ set_page_dirty(page);
+ put_page(page);
+
+ kfree(ao->op_notifier);
+ ao->op_notifier = NULL;
+ ao->op_active = 0;
}
+
/*
- * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ * Count the number of pages needed to describe an incoming iovec.
*/
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
- struct rds_rdma_args *args)
+static int rds_rdma_pages(struct rds_rdma_args *args)
{
struct rds_iovec vec;
- struct rds_rdma_op *op = NULL;
+ struct rds_iovec __user *local_vec;
+ unsigned int tot_pages = 0;
unsigned int nr_pages;
- unsigned int max_pages;
+ unsigned int i;
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec)))
+ return -EFAULT;
+
+ nr_pages = rds_pages_in_vec(&vec);
+ if (nr_pages == 0)
+ return -EINVAL;
+
+ tot_pages += nr_pages;
+ }
+
+ return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+ return rds_rdma_pages(args) * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rds_rdma_args *args;
+ struct rds_iovec vec;
+ struct rm_rdma_op *op = &rm->rdma;
+ int nr_pages;
unsigned int nr_bytes;
struct page **pages = NULL;
struct rds_iovec __user *local_vec;
- struct scatterlist *sg;
unsigned int nr;
unsigned int i, j;
- int ret;
+ int ret = 0;
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+ || rm->rdma.op_active)
+ return -EINVAL;
+
+ args = CMSG_DATA(cmsg);
if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */
@@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
goto out;
}
- nr_pages = 0;
- max_pages = 0;
-
- local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
- /* figure out the number of pages in the vector */
- for (i = 0; i < args->nr_local; i++) {
- if (copy_from_user(&vec, &local_vec[i],
- sizeof(struct rds_iovec))) {
- ret = -EFAULT;
- goto out;
- }
-
- nr = rds_pages_in_vec(&vec);
- if (nr == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- max_pages = max(nr, max_pages);
- nr_pages += nr;
- }
-
- pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
- if (pages == NULL) {
- ret = -ENOMEM;
+ nr_pages = rds_rdma_pages(args);
+ if (nr_pages < 0)
goto out;
- }
- op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
- if (op == NULL) {
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages) {
ret = -ENOMEM;
goto out;
}
- op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
- op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
- op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
- op->r_recverr = rs->rs_recverr;
+ op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+ op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+ op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+ op->op_active = 1;
+ op->op_recverr = rs->rs_recverr;
WARN_ON(!nr_pages);
- sg_init_table(op->r_sg, nr_pages);
+ op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
- if (op->r_notify || op->r_recverr) {
+ if (op->op_notify || op->op_recverr) {
/* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations.
*/
- op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
- if (!op->r_notifier) {
+ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+ if (!op->op_notifier) {
ret = -ENOMEM;
goto out;
}
- op->r_notifier->n_user_token = args->user_token;
- op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+ op->op_notifier->n_user_token = args->user_token;
+ op->op_notifier->n_status = RDS_RDMA_SUCCESS;
}
/* The cookie contains the R_Key of the remote memory region, and
@@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
* destination address (which is really an offset into the MR)
* FIXME: We may want to move this into ib_rdma.c
*/
- op->r_key = rds_rdma_cookie_key(args->cookie);
- op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+ op->op_rkey = rds_rdma_cookie_key(args->cookie);
+ op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
nr_bytes = 0;
rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
(unsigned long long)args->nr_local,
(unsigned long long)args->remote_vec.addr,
- op->r_key);
+ op->op_rkey);
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
for (i = 0; i < args->nr_local; i++) {
if (copy_from_user(&vec, &local_vec[i],
@@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
rs->rs_user_addr = vec.addr;
rs->rs_user_bytes = vec.bytes;
- /* did the user change the vec under us? */
- if (nr > max_pages || op->r_nents + nr > nr_pages) {
- ret = -EINVAL;
- goto out;
- }
/* If it's a WRITE operation, we want to pin the pages for reading.
* If it's a READ operation, we need to pin the pages for writing.
*/
- ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+ ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write);
if (ret < 0)
goto out;
@@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
for (j = 0; j < nr; j++) {
unsigned int offset = vec.addr & ~PAGE_MASK;
+ struct scatterlist *sg;
- sg = &op->r_sg[op->r_nents + j];
+ sg = &op->op_sg[op->op_nents + j];
sg_set_page(sg, pages[j],
min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
offset);
@@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
vec.bytes -= sg->length;
}
- op->r_nents += nr;
+ op->op_nents += nr;
}
-
if (nr_bytes > args->remote_vec.bytes) {
rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
nr_bytes,
@@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
ret = -EINVAL;
goto out;
}
- op->r_bytes = nr_bytes;
+ op->op_bytes = nr_bytes;
ret = 0;
out:
kfree(pages);
- if (ret) {
- if (op)
- rds_rdma_free_op(op);
- op = ERR_PTR(ret);
- }
- return op;
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg)
-{
- struct rds_rdma_op *op;
-
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
- rm->m_rdma_op != NULL)
- return -EINVAL;
+ if (ret)
+ rds_rdma_free_op(op);
- op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
- if (IS_ERR(op))
- return PTR_ERR(op);
rds_stats_inc(s_send_rdma);
- rm->m_rdma_op = op;
- return 0;
+
+ return ret;
}
/*
@@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
- if (mr == NULL)
+ if (!mr)
err = -EINVAL; /* invalid r_key */
else
atomic_inc(&mr->r_refcount);
@@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
if (mr) {
mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
- rm->m_rdma_mr = mr;
+ rm->rdma.op_rdma_mr = mr;
}
return err;
}
@@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
rm->m_rdma_cookie != 0)
return -EINVAL;
- return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct page *page = NULL;
+ struct rds_atomic_args *args;
+ int ret = 0;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+ || rm->atomic.op_active)
+ return -EINVAL;
+
+ args = CMSG_DATA(cmsg);
+
+ /* Nonmasked & masked cmsg ops converted to masked hw ops */
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_ATOMIC_FADD:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+ rm->atomic.op_m_fadd.add = args->fadd.add;
+ rm->atomic.op_m_fadd.nocarry_mask = 0;
+ break;
+ case RDS_CMSG_MASKED_ATOMIC_FADD:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+ rm->atomic.op_m_fadd.add = args->m_fadd.add;
+ rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+ break;
+ case RDS_CMSG_ATOMIC_CSWP:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+ rm->atomic.op_m_cswp.compare = args->cswp.compare;
+ rm->atomic.op_m_cswp.swap = args->cswp.swap;
+ rm->atomic.op_m_cswp.compare_mask = ~0;
+ rm->atomic.op_m_cswp.swap_mask = ~0;
+ break;
+ case RDS_CMSG_MASKED_ATOMIC_CSWP:
+ rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+ rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+ rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+ rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+ rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+ break;
+ default:
+ BUG(); /* should never happen */
+ }
+
+ rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+ rm->atomic.op_active = 1;
+ rm->atomic.op_recverr = rs->rs_recverr;
+ rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+
+ /* verify 8 byte-aligned */
+ if (args->local_addr & 0x7) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+ if (ret != 1)
+ goto err;
+ ret = 0;
+
+ sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+ if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+ /* We allocate an uninitialized notifier here, because
+ * we don't want to do that in the completion handler. We
+ * would have to use GFP_ATOMIC there, and don't want to deal
+ * with failed allocations.
+ */
+ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+ if (!rm->atomic.op_notifier) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ rm->atomic.op_notifier->n_user_token = args->user_token;
+ rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+ }
+
+ rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+ rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+ return ret;
+err:
+ if (page)
+ put_page(page);
+ kfree(rm->atomic.op_notifier);
+
+ return ret;
}
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
deleted file mode 100644
index 909c39835a5..00000000000
--- a/net/rds/rdma.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _RDS_RDMA_H
-#define _RDS_RDMA_H
-
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/scatterlist.h>
-
-#include "rds.h"
-
-struct rds_mr {
- struct rb_node r_rb_node;
- atomic_t r_refcount;
- u32 r_key;
-
- /* A copy of the creation flags */
- unsigned int r_use_once:1;
- unsigned int r_invalidate:1;
- unsigned int r_write:1;
-
- /* This is for RDS_MR_DEAD.
- * It would be nice & consistent to make this part of the above
- * bit field here, but we need to use test_and_set_bit.
- */
- unsigned long r_state;
- struct rds_sock *r_sock; /* back pointer to the socket that owns us */
- struct rds_transport *r_trans;
- void *r_trans_private;
-};
-
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD 0
-
-struct rds_rdma_op {
- u32 r_key;
- u64 r_remote_addr;
- unsigned int r_write:1;
- unsigned int r_fence:1;
- unsigned int r_notify:1;
- unsigned int r_recverr:1;
- unsigned int r_mapped:1;
- struct rds_notifier *r_notifier;
- unsigned int r_bytes;
- unsigned int r_nents;
- unsigned int r_count;
- struct scatterlist r_sg[0];
-};
-
-static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
-{
- return r_key | (((u64) offset) << 32);
-}
-
-static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
-{
- return cookie;
-}
-
-static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
-{
- return cookie >> 32;
-}
-
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
-void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rds_rdma_op *ro);
-void rds_rdma_send_complete(struct rds_message *rm, int);
-
-extern void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
- if (atomic_dec_and_test(&mr->r_refcount))
- __rds_put_mr_final(mr);
-}
-
-#endif
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index e599ba2f950..4195a053982 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -36,6 +36,34 @@
static struct rdma_cm_id *rds_rdma_listen_id;
+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+ [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+ RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+ RDS_CM_EVENT_STRING(ADDR_ERROR),
+ RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+ RDS_CM_EVENT_STRING(ROUTE_ERROR),
+ RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+ RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+ RDS_CM_EVENT_STRING(CONNECT_ERROR),
+ RDS_CM_EVENT_STRING(UNREACHABLE),
+ RDS_CM_EVENT_STRING(REJECTED),
+ RDS_CM_EVENT_STRING(ESTABLISHED),
+ RDS_CM_EVENT_STRING(DISCONNECTED),
+ RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+ RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+ RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+ RDS_CM_EVENT_STRING(ADDR_CHANGE),
+ RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+ return rds_str_array(rds_cm_event_strings,
+ ARRAY_SIZE(rds_cm_event_strings), type);
+};
+
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rds_transport *trans;
int ret = 0;
- rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
- event->event);
+ rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+ event->event, rds_cm_event_str(event->event));
if (cm_id->device->node_type == RDMA_NODE_RNIC)
trans = &rds_iw_transport;
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
default:
/* things like device disconnect? */
- printk(KERN_ERR "RDS: unknown event %u!\n", event->event);
+ printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+ event->event, rds_cm_event_str(event->event));
break;
}
@@ -117,12 +146,13 @@ out:
if (conn)
mutex_unlock(&conn->c_cm_lock);
- rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
+ rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+ rds_cm_event_str(event->event), ret);
return ret;
}
-static int __init rds_rdma_listen_init(void)
+static int rds_rdma_listen_init(void)
{
struct sockaddr_in sin;
struct rdma_cm_id *cm_id;
@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void)
}
}
-int __init rds_rdma_init(void)
+static int rds_rdma_init(void)
{
int ret;
@@ -204,7 +234,7 @@ out:
}
module_init(rds_rdma_init);
-void rds_rdma_exit(void)
+static void rds_rdma_exit(void)
{
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index 2f2c7d976c2..faba4e38269 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -11,10 +11,6 @@ int rds_rdma_conn_connect(struct rds_connection *conn);
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
-/* from rdma_transport.c */
-int rds_rdma_init(void);
-void rds_rdma_exit(void);
-
/* from ib.c */
extern struct rds_transport rds_ib_transport;
int rds_ib_init(void);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c224b5bb3ba..9542449c072 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -80,6 +80,7 @@ enum {
/* Bits for c_flags */
#define RDS_LL_SEND_FULL 0
#define RDS_RECONNECT_PENDING 1
+#define RDS_IN_XMIT 2
struct rds_connection {
struct hlist_node c_hash_node;
@@ -91,12 +92,13 @@ struct rds_connection {
struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong;
- struct mutex c_send_lock; /* protect send ring */
struct rds_message *c_xmit_rm;
unsigned long c_xmit_sg;
unsigned int c_xmit_hdr_off;
unsigned int c_xmit_data_off;
+ unsigned int c_xmit_atomic_sent;
unsigned int c_xmit_rdma_sent;
+ unsigned int c_xmit_data_sent;
spinlock_t c_lock; /* protect msg queues */
u64 c_next_tx_seq;
@@ -116,11 +118,10 @@ struct rds_connection {
struct delayed_work c_conn_w;
struct work_struct c_down_w;
struct mutex c_cm_lock; /* protect conn state & cm */
+ wait_queue_head_t c_waitq;
struct list_head c_map_item;
unsigned long c_map_queued;
- unsigned long c_map_offset;
- unsigned long c_map_bytes;
unsigned int c_unacked_packets;
unsigned int c_unacked_bytes;
@@ -206,6 +207,48 @@ struct rds_incoming {
rds_rdma_cookie_t i_rdma_cookie;
};
+struct rds_mr {
+ struct rb_node r_rb_node;
+ atomic_t r_refcount;
+ u32 r_key;
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ /* This is for RDS_MR_DEAD.
+ * It would be nice & consistent to make this part of the above
+ * bit field here, but we need to use test_and_set_bit.
+ */
+ unsigned long r_state;
+ struct rds_sock *r_sock; /* back pointer to the socket that owns us */
+ struct rds_transport *r_trans;
+ void *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD 0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+ return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+ return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+ return cookie >> 32;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP 0
+#define RDS_ATOMIC_TYPE_FADD 1
+
/*
* m_sock_item and m_conn_item are on lists that are serialized under
* conn->c_lock. m_sock_item has additional meaning in that once it is empty
@@ -258,13 +301,71 @@ struct rds_message {
* -> rs->rs_lock
*/
spinlock_t m_rs_lock;
+ wait_queue_head_t m_flush_wait;
+
struct rds_sock *m_rs;
- struct rds_rdma_op *m_rdma_op;
+
+ /* cookie to send to remote, in rds header */
rds_rdma_cookie_t m_rdma_cookie;
- struct rds_mr *m_rdma_mr;
- unsigned int m_nents;
- unsigned int m_count;
- struct scatterlist m_sg[0];
+
+ unsigned int m_used_sgs;
+ unsigned int m_total_sgs;
+
+ void *m_final_op;
+
+ struct {
+ struct rm_atomic_op {
+ int op_type;
+ union {
+ struct {
+ uint64_t compare;
+ uint64_t swap;
+ uint64_t compare_mask;
+ uint64_t swap_mask;
+ } op_m_cswp;
+ struct {
+ uint64_t add;
+ uint64_t nocarry_mask;
+ } op_m_fadd;
+ };
+
+ u32 op_rkey;
+ u64 op_remote_addr;
+ unsigned int op_notify:1;
+ unsigned int op_recverr:1;
+ unsigned int op_mapped:1;
+ unsigned int op_silent:1;
+ unsigned int op_active:1;
+ struct scatterlist *op_sg;
+ struct rds_notifier *op_notifier;
+
+ struct rds_mr *op_rdma_mr;
+ } atomic;
+ struct rm_rdma_op {
+ u32 op_rkey;
+ u64 op_remote_addr;
+ unsigned int op_write:1;
+ unsigned int op_fence:1;
+ unsigned int op_notify:1;
+ unsigned int op_recverr:1;
+ unsigned int op_mapped:1;
+ unsigned int op_silent:1;
+ unsigned int op_active:1;
+ unsigned int op_bytes;
+ unsigned int op_nents;
+ unsigned int op_count;
+ struct scatterlist *op_sg;
+ struct rds_notifier *op_notifier;
+
+ struct rds_mr *op_rdma_mr;
+ } rdma;
+ struct rm_data_op {
+ unsigned int op_active:1;
+ unsigned int op_nents;
+ unsigned int op_count;
+ struct scatterlist *op_sg;
+ } data;
+ };
};
/*
@@ -305,10 +406,6 @@ struct rds_notifier {
* transport is responsible for other serialization, including
* rds_recv_incoming(). This is called in process context but
* should try hard not to block.
- *
- * @xmit_cong_map: This asks the transport to send the local bitmap down the
- * given connection. XXX get a better story about the bitmap
- * flag and header.
*/
#define RDS_TRANS_IB 0
@@ -332,13 +429,11 @@ struct rds_transport {
void (*xmit_complete)(struct rds_connection *conn);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
- int (*xmit_cong_map)(struct rds_connection *conn,
- struct rds_cong_map *map, unsigned long offset);
- int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+ int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+ int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
int (*recv)(struct rds_connection *conn);
int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
size_t size);
- void (*inc_purge)(struct rds_incoming *inc);
void (*inc_free)(struct rds_incoming *inc);
int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -367,17 +462,11 @@ struct rds_sock {
* bound_addr used for both incoming and outgoing, no INADDR_ANY
* support.
*/
- struct rb_node rs_bound_node;
+ struct hlist_node rs_bound_node;
__be32 rs_bound_addr;
__be32 rs_conn_addr;
__be16 rs_bound_port;
__be16 rs_conn_port;
-
- /*
- * This is only used to communicate the transport between bind and
- * initiating connections. All other trans use is referenced through
- * the connection.
- */
struct rds_transport *rs_transport;
/*
@@ -466,8 +555,8 @@ struct rds_statistics {
uint64_t s_recv_ping;
uint64_t s_send_queue_empty;
uint64_t s_send_queue_full;
- uint64_t s_send_sem_contention;
- uint64_t s_send_sem_queue_raced;
+ uint64_t s_send_lock_contention;
+ uint64_t s_send_lock_queue_raced;
uint64_t s_send_immediate_retry;
uint64_t s_send_delayed_retry;
uint64_t s_send_drop_acked;
@@ -487,6 +576,7 @@ struct rds_statistics {
};
/* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
@@ -521,15 +611,16 @@ void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
-int __init rds_conn_init(void);
+int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
-void rds_conn_reset(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
@@ -566,7 +657,8 @@ rds_conn_connecting(struct rds_connection *conn)
/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
size_t total_len);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -575,12 +667,9 @@ int rds_message_add_extension(struct rds_header *hdr,
unsigned int type, const void *data, unsigned int len);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
-int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
-int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
struct iovec *first_iov, size_t size);
-void rds_message_inc_purge(struct rds_incoming *inc);
void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm);
@@ -614,7 +703,6 @@ void rds_page_exit(void);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr);
-void rds_inc_addref(struct rds_incoming *inc);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_incoming *inc, gfp_t gfp, enum km_type km);
@@ -636,14 +724,38 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
-int rds_send_acked_before(struct rds_connection *conn, u64 seq);
-void rds_send_remove_from_sock(struct list_head *messages, int status);
int rds_send_pong(struct rds_connection *conn, __be16 dport);
struct rds_message *rds_send_get_message(struct rds_connection *,
- struct rds_rdma_op *);
+ struct rm_rdma_op *);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+ if (atomic_dec_and_test(&mr->r_refcount))
+ __rds_put_mr_final(mr);
+}
/* stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
@@ -657,14 +769,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
put_cpu(); \
} while (0)
#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
-int __init rds_stats_init(void);
+int rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
uint64_t *values, const char *const *names,
size_t nr);
/* sysctl.c */
-int __init rds_sysctl_init(void);
+int rds_sysctl_init(void);
void rds_sysctl_exit(void);
extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default;
@@ -678,9 +790,10 @@ extern unsigned long rds_sysctl_trace_flags;
extern unsigned int rds_sysctl_trace_level;
/* threads.c */
-int __init rds_threads_init(void);
+int rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
@@ -691,9 +804,10 @@ void rds_connect_complete(struct rds_connection *conn);
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
-int __init rds_trans_init(void);
+int rds_trans_init(void);
void rds_trans_exit(void);
#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 795a00b7f2c..596689e5927 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -36,7 +36,6 @@
#include <linux/in.h>
#include "rds.h"
-#include "rdma.h"
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr)
@@ -49,12 +48,11 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
}
EXPORT_SYMBOL_GPL(rds_inc_init);
-void rds_inc_addref(struct rds_incoming *inc)
+static void rds_inc_addref(struct rds_incoming *inc)
{
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
atomic_inc(&inc->i_refcount);
}
-EXPORT_SYMBOL_GPL(rds_inc_addref);
void rds_inc_put(struct rds_incoming *inc)
{
@@ -210,7 +208,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
}
rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
- if (rs == NULL) {
+ if (!rs) {
rds_stats_inc(s_recv_drop_no_sock);
goto out;
}
@@ -251,7 +249,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
{
unsigned long flags;
- if (*inc == NULL) {
+ if (!*inc) {
read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!list_empty(&rs->rs_recv_queue)) {
*inc = list_entry(rs->rs_recv_queue.next,
@@ -297,7 +295,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
struct rds_notifier *notifier;
- struct rds_rdma_notify cmsg;
+ struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
unsigned int count = 0, max_messages = ~0U;
unsigned long flags;
LIST_HEAD(copy);
@@ -334,10 +332,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
if (msghdr) {
cmsg.user_token = notifier->n_user_token;
- cmsg.status = notifier->n_status;
+ cmsg.status = notifier->n_status;
err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
- sizeof(cmsg), &cmsg);
+ sizeof(cmsg), &cmsg);
if (err)
break;
}
diff --git a/net/rds/send.c b/net/rds/send.c
index 9c1c6bcaa6c..0bc9db17a87 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -37,7 +37,6 @@
#include <linux/list.h>
#include "rds.h"
-#include "rdma.h"
/* When transmitting messages in rds_send_xmit, we need to emerge from
* time to time and briefly release the CPU. Otherwise the softlock watchdog
@@ -53,8 +52,11 @@ static int send_batch_count = 64;
module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+static void rds_send_remove_from_sock(struct list_head *messages, int status);
+
/*
- * Reset the send state. Caller must hold c_send_lock when calling here.
+ * Reset the send state. Callers must ensure that this doesn't race with
+ * rds_send_xmit().
*/
void rds_send_reset(struct rds_connection *conn)
{
@@ -62,18 +64,22 @@ void rds_send_reset(struct rds_connection *conn)
unsigned long flags;
if (conn->c_xmit_rm) {
+ rm = conn->c_xmit_rm;
+ conn->c_xmit_rm = NULL;
/* Tell the user the RDMA op is no longer mapped by the
* transport. This isn't entirely true (it's flushed out
* independently) but as the connection is down, there's
* no ongoing RDMA to/from that memory */
- rds_message_unmapped(conn->c_xmit_rm);
- rds_message_put(conn->c_xmit_rm);
- conn->c_xmit_rm = NULL;
+ rds_message_unmapped(rm);
+ rds_message_put(rm);
}
+
conn->c_xmit_sg = 0;
conn->c_xmit_hdr_off = 0;
conn->c_xmit_data_off = 0;
+ conn->c_xmit_atomic_sent = 0;
conn->c_xmit_rdma_sent = 0;
+ conn->c_xmit_data_sent = 0;
conn->c_map_queued = 0;
@@ -90,6 +96,25 @@ void rds_send_reset(struct rds_connection *conn)
spin_unlock_irqrestore(&conn->c_lock, flags);
}
+static int acquire_in_xmit(struct rds_connection *conn)
+{
+ return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+ clear_bit(RDS_IN_XMIT, &conn->c_flags);
+ smp_mb__after_clear_bit();
+ /*
+ * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+ * hot path and finding waiters is very rare. We don't want to walk
+ * the system-wide hashed waitqueue buckets in the fast path only to
+ * almost never find waiters.
+ */
+ if (waitqueue_active(&conn->c_waitq))
+ wake_up_all(&conn->c_waitq);
+}
+
/*
* We're making the concious trade-off here to only send one message
* down the connection at a time.
@@ -109,102 +134,69 @@ int rds_send_xmit(struct rds_connection *conn)
struct rds_message *rm;
unsigned long flags;
unsigned int tmp;
- unsigned int send_quota = send_batch_count;
struct scatterlist *sg;
int ret = 0;
- int was_empty = 0;
LIST_HEAD(to_be_dropped);
+restart:
+
/*
* sendmsg calls here after having queued its message on the send
* queue. We only have one task feeding the connection at a time. If
* another thread is already feeding the queue then we back off. This
* avoids blocking the caller and trading per-connection data between
* caches per message.
- *
- * The sem holder will issue a retry if they notice that someone queued
- * a message after they stopped walking the send queue but before they
- * dropped the sem.
*/
- if (!mutex_trylock(&conn->c_send_lock)) {
- rds_stats_inc(s_send_sem_contention);
+ if (!acquire_in_xmit(conn)) {
+ rds_stats_inc(s_send_lock_contention);
ret = -ENOMEM;
goto out;
}
+ /*
+ * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+ * we do the opposite to avoid races.
+ */
+ if (!rds_conn_up(conn)) {
+ release_in_xmit(conn);
+ ret = 0;
+ goto out;
+ }
+
if (conn->c_trans->xmit_prepare)
conn->c_trans->xmit_prepare(conn);
/*
* spin trying to push headers and data down the connection until
- * the connection doens't make forward progress.
+ * the connection doesn't make forward progress.
*/
- while (--send_quota) {
- /*
- * See if need to send a congestion map update if we're
- * between sending messages. The send_sem protects our sole
- * use of c_map_offset and _bytes.
- * Note this is used only by transports that define a special
- * xmit_cong_map function. For all others, we create allocate
- * a cong_map message and treat it just like any other send.
- */
- if (conn->c_map_bytes) {
- ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
- conn->c_map_offset);
- if (ret <= 0)
- break;
+ while (1) {
- conn->c_map_offset += ret;
- conn->c_map_bytes -= ret;
- if (conn->c_map_bytes)
- continue;
- }
-
- /* If we're done sending the current message, clear the
- * offset and S/G temporaries.
- */
rm = conn->c_xmit_rm;
- if (rm != NULL &&
- conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
- conn->c_xmit_sg == rm->m_nents) {
- conn->c_xmit_rm = NULL;
- conn->c_xmit_sg = 0;
- conn->c_xmit_hdr_off = 0;
- conn->c_xmit_data_off = 0;
- conn->c_xmit_rdma_sent = 0;
-
- /* Release the reference to the previous message. */
- rds_message_put(rm);
- rm = NULL;
- }
- /* If we're asked to send a cong map update, do so.
+ /*
+ * If between sending messages, we can send a pending congestion
+ * map update.
*/
- if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
- if (conn->c_trans->xmit_cong_map != NULL) {
- conn->c_map_offset = 0;
- conn->c_map_bytes = sizeof(struct rds_header) +
- RDS_CONG_MAP_BYTES;
- continue;
- }
-
+ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
rm = rds_cong_update_alloc(conn);
if (IS_ERR(rm)) {
ret = PTR_ERR(rm);
break;
}
+ rm->data.op_active = 1;
conn->c_xmit_rm = rm;
}
/*
- * Grab the next message from the send queue, if there is one.
+ * If not already working on one, grab the next message.
*
* c_xmit_rm holds a ref while we're sending this message down
* the connction. We can use this ref while holding the
* send_sem.. rds_send_reset() is serialized with it.
*/
- if (rm == NULL) {
+ if (!rm) {
unsigned int len;
spin_lock_irqsave(&conn->c_lock, flags);
@@ -224,10 +216,8 @@ int rds_send_xmit(struct rds_connection *conn)
spin_unlock_irqrestore(&conn->c_lock, flags);
- if (rm == NULL) {
- was_empty = 1;
+ if (!rm)
break;
- }
/* Unfortunately, the way Infiniband deals with
* RDMA to a bad MR key is by moving the entire
@@ -236,13 +226,12 @@ int rds_send_xmit(struct rds_connection *conn)
* connection.
* Therefore, we never retransmit messages with RDMA ops.
*/
- if (rm->m_rdma_op &&
+ if (rm->rdma.op_active &&
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
spin_lock_irqsave(&conn->c_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
spin_unlock_irqrestore(&conn->c_lock, flags);
- rds_message_put(rm);
continue;
}
@@ -263,23 +252,55 @@ int rds_send_xmit(struct rds_connection *conn)
conn->c_xmit_rm = rm;
}
- /*
- * Try and send an rdma message. Let's see if we can
- * keep this simple and require that the transport either
- * send the whole rdma or none of it.
- */
- if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
- ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+ /* The transport either sends the whole rdma or none of it */
+ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+ rm->m_final_op = &rm->rdma;
+ ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
if (ret)
break;
conn->c_xmit_rdma_sent = 1;
+
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue */
set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
- if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
- conn->c_xmit_sg < rm->m_nents) {
+ if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+ rm->m_final_op = &rm->atomic;
+ ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+ if (ret)
+ break;
+ conn->c_xmit_atomic_sent = 1;
+
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ }
+
+ /*
+ * A number of cases require an RDS header to be sent
+ * even if there is no data.
+ * We permit 0-byte sends; rds-ping depends on this.
+ * However, if there are exclusively attached silent ops,
+ * we skip the hdr/data send, to enable silent operation.
+ */
+ if (rm->data.op_nents == 0) {
+ int ops_present;
+ int all_ops_are_silent = 1;
+
+ ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+ if (rm->atomic.op_active && !rm->atomic.op_silent)
+ all_ops_are_silent = 0;
+ if (rm->rdma.op_active && !rm->rdma.op_silent)
+ all_ops_are_silent = 0;
+
+ if (ops_present && all_ops_are_silent
+ && !rm->m_rdma_cookie)
+ rm->data.op_active = 0;
+ }
+
+ if (rm->data.op_active && !conn->c_xmit_data_sent) {
+ rm->m_final_op = &rm->data;
ret = conn->c_trans->xmit(conn, rm,
conn->c_xmit_hdr_off,
conn->c_xmit_sg,
@@ -295,7 +316,7 @@ int rds_send_xmit(struct rds_connection *conn)
ret -= tmp;
}
- sg = &rm->m_sg[conn->c_xmit_sg];
+ sg = &rm->data.op_sg[conn->c_xmit_sg];
while (ret) {
tmp = min_t(int, ret, sg->length -
conn->c_xmit_data_off);
@@ -306,49 +327,63 @@ int rds_send_xmit(struct rds_connection *conn)
sg++;
conn->c_xmit_sg++;
BUG_ON(ret != 0 &&
- conn->c_xmit_sg == rm->m_nents);
+ conn->c_xmit_sg == rm->data.op_nents);
}
}
+
+ if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+ (conn->c_xmit_sg == rm->data.op_nents))
+ conn->c_xmit_data_sent = 1;
}
- }
- /* Nuke any messages we decided not to retransmit. */
- if (!list_empty(&to_be_dropped))
- rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+ /*
+ * A rm will only take multiple times through this loop
+ * if there is a data op. Thus, if the data is sent (or there was
+ * none), then we're done with the rm.
+ */
+ if (!rm->data.op_active || conn->c_xmit_data_sent) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+ conn->c_xmit_atomic_sent = 0;
+ conn->c_xmit_data_sent = 0;
+
+ rds_message_put(rm);
+ }
+ }
if (conn->c_trans->xmit_complete)
conn->c_trans->xmit_complete(conn);
- /*
- * We might be racing with another sender who queued a message but
- * backed off on noticing that we held the c_send_lock. If we check
- * for queued messages after dropping the sem then either we'll
- * see the queued message or the queuer will get the sem. If we
- * notice the queued message then we trigger an immediate retry.
- *
- * We need to be careful only to do this when we stopped processing
- * the send queue because it was empty. It's the only way we
- * stop processing the loop when the transport hasn't taken
- * responsibility for forward progress.
- */
- mutex_unlock(&conn->c_send_lock);
+ release_in_xmit(conn);
- if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
- /* We exhausted the send quota, but there's work left to
- * do. Return and (re-)schedule the send worker.
- */
- ret = -EAGAIN;
+ /* Nuke any messages we decided not to retransmit. */
+ if (!list_empty(&to_be_dropped)) {
+ /* irqs on here, so we can put(), unlike above */
+ list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+ rds_message_put(rm);
+ rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
}
- if (ret == 0 && was_empty) {
- /* A simple bit test would be way faster than taking the
- * spin lock */
- spin_lock_irqsave(&conn->c_lock, flags);
+ /*
+ * Other senders can queue a message after we last test the send queue
+ * but before we clear RDS_IN_XMIT. In that case they'd back off and
+ * not try and send their newly queued message. We need to check the
+ * send queue after having cleared RDS_IN_XMIT so that their message
+ * doesn't get stuck on the send queue.
+ *
+ * If the transport cannot continue (i.e ret != 0), then it must
+ * call us when more room is available, such as from the tx
+ * completion handler.
+ */
+ if (ret == 0) {
+ smp_mb();
if (!list_empty(&conn->c_send_queue)) {
- rds_stats_inc(s_send_sem_queue_raced);
- ret = -EAGAIN;
+ rds_stats_inc(s_send_lock_queue_raced);
+ goto restart;
}
- spin_unlock_irqrestore(&conn->c_lock, flags);
}
out:
return ret;
@@ -376,52 +411,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
}
/*
- * Returns true if there are no messages on the send and retransmit queues
- * which have a sequence number greater than or equal to the given sequence
- * number.
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
*/
-int rds_send_acked_before(struct rds_connection *conn, u64 seq)
+void rds_rdma_send_complete(struct rds_message *rm, int status)
{
- struct rds_message *rm, *tmp;
- int ret = 1;
+ struct rds_sock *rs = NULL;
+ struct rm_rdma_op *ro;
+ struct rds_notifier *notifier;
+ unsigned long flags;
- spin_lock(&conn->c_lock);
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
- list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
- if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
- ret = 0;
- break;
- }
+ ro = &rm->rdma;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+ ro->op_active && ro->op_notify && ro->op_notifier) {
+ notifier = ro->op_notifier;
+ rs = rm->m_rs;
+ sock_hold(rds_rs_to_sk(rs));
- list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
- if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
- ret = 0;
- break;
+ notifier->n_status = status;
+ spin_lock(&rs->rs_lock);
+ list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+ spin_unlock(&rs->rs_lock);
+
+ ro->op_notifier = NULL;
}
- spin_unlock(&conn->c_lock);
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
- return ret;
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
}
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
/*
- * This is pretty similar to what happens below in the ACK
- * handling code - except that we call here as soon as we get
- * the IB send completion on the RDMA op and the accompanying
- * message.
+ * Just like above, except looks at atomic op
*/
-void rds_rdma_send_complete(struct rds_message *rm, int status)
+void rds_atomic_send_complete(struct rds_message *rm, int status)
{
struct rds_sock *rs = NULL;
- struct rds_rdma_op *ro;
+ struct rm_atomic_op *ao;
struct rds_notifier *notifier;
+ unsigned long flags;
- spin_lock(&rm->m_rs_lock);
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
- ro = rm->m_rdma_op;
- if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
- ro && ro->r_notify && ro->r_notifier) {
- notifier = ro->r_notifier;
+ ao = &rm->atomic;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+ && ao->op_active && ao->op_notify && ao->op_notifier) {
+ notifier = ao->op_notifier;
rs = rm->m_rs;
sock_hold(rds_rs_to_sk(rs));
@@ -430,17 +473,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
spin_unlock(&rs->rs_lock);
- ro->r_notifier = NULL;
+ ao->op_notifier = NULL;
}
- spin_unlock(&rm->m_rs_lock);
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
if (rs) {
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
}
-EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
/*
* This is the same as rds_rdma_send_complete except we
@@ -448,15 +491,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
* socket, socket lock) and can just move the notifier.
*/
static inline void
-__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{
- struct rds_rdma_op *ro;
+ struct rm_rdma_op *ro;
+ struct rm_atomic_op *ao;
+
+ ro = &rm->rdma;
+ if (ro->op_active && ro->op_notify && ro->op_notifier) {
+ ro->op_notifier->n_status = status;
+ list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+ ro->op_notifier = NULL;
+ }
- ro = rm->m_rdma_op;
- if (ro && ro->r_notify && ro->r_notifier) {
- ro->r_notifier->n_status = status;
- list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
- ro->r_notifier = NULL;
+ ao = &rm->atomic;
+ if (ao->op_active && ao->op_notify && ao->op_notifier) {
+ ao->op_notifier->n_status = status;
+ list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+ ao->op_notifier = NULL;
}
/* No need to wake the app - caller does this */
@@ -468,7 +519,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
* So speed is not an issue here.
*/
struct rds_message *rds_send_get_message(struct rds_connection *conn,
- struct rds_rdma_op *op)
+ struct rm_rdma_op *op)
{
struct rds_message *rm, *tmp, *found = NULL;
unsigned long flags;
@@ -476,7 +527,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
spin_lock_irqsave(&conn->c_lock, flags);
list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
- if (rm->m_rdma_op == op) {
+ if (&rm->rdma == op) {
atomic_inc(&rm->m_refcount);
found = rm;
goto out;
@@ -484,7 +535,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
}
list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
- if (rm->m_rdma_op == op) {
+ if (&rm->rdma == op) {
atomic_inc(&rm->m_refcount);
found = rm;
break;
@@ -506,7 +557,7 @@ EXPORT_SYMBOL_GPL(rds_send_get_message);
* removing the messages from the 'messages' list regardless of if it found
* the messages on the socket list or not.
*/
-void rds_send_remove_from_sock(struct list_head *messages, int status)
+static void rds_send_remove_from_sock(struct list_head *messages, int status)
{
unsigned long flags;
struct rds_sock *rs = NULL;
@@ -544,19 +595,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
spin_lock(&rs->rs_lock);
if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
- struct rds_rdma_op *ro = rm->m_rdma_op;
+ struct rm_rdma_op *ro = &rm->rdma;
struct rds_notifier *notifier;
list_del_init(&rm->m_sock_item);
rds_send_sndbuf_remove(rs, rm);
- if (ro && ro->r_notifier && (status || ro->r_notify)) {
- notifier = ro->r_notifier;
+ if (ro->op_active && ro->op_notifier &&
+ (ro->op_notify || (ro->op_recverr && status))) {
+ notifier = ro->op_notifier;
list_add_tail(&notifier->n_list,
&rs->rs_notify_queue);
if (!notifier->n_status)
notifier->n_status = status;
- rm->m_rdma_op->r_notifier = NULL;
+ rm->rdma.op_notifier = NULL;
}
was_on_sock = 1;
rm->m_rs = NULL;
@@ -619,9 +671,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
- unsigned long flags, flags2;
+ unsigned long flags;
LIST_HEAD(list);
- int wake = 0;
/* get all the messages we're dropping under the rs lock */
spin_lock_irqsave(&rs->rs_lock, flags);
@@ -631,59 +682,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
dest->sin_port != rm->m_inc.i_hdr.h_dport))
continue;
- wake = 1;
list_move(&rm->m_sock_item, &list);
rds_send_sndbuf_remove(rs, rm);
clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
}
/* order flag updates with the rs lock */
- if (wake)
- smp_mb__after_clear_bit();
+ smp_mb__after_clear_bit();
spin_unlock_irqrestore(&rs->rs_lock, flags);
- conn = NULL;
+ if (list_empty(&list))
+ return;
- /* now remove the messages from the conn list as needed */
+ /* Remove the messages from the conn */
list_for_each_entry(rm, &list, m_sock_item) {
- /* We do this here rather than in the loop above, so that
- * we don't have to nest m_rs_lock under rs->rs_lock */
- spin_lock_irqsave(&rm->m_rs_lock, flags2);
- /* If this is a RDMA operation, notify the app. */
- spin_lock(&rs->rs_lock);
- __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
- spin_unlock(&rs->rs_lock);
- rm->m_rs = NULL;
- spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
+ conn = rm->m_inc.i_conn;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
/*
- * If we see this flag cleared then we're *sure* that someone
- * else beat us to removing it from the conn. If we race
- * with their flag update we'll get the lock and then really
- * see that the flag has been cleared.
+ * Maybe someone else beat us to removing rm from the conn.
+ * If we race with their flag update we'll get the lock and
+ * then really see that the flag has been cleared.
*/
- if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+ spin_unlock_irqrestore(&conn->c_lock, flags);
continue;
-
- if (conn != rm->m_inc.i_conn) {
- if (conn)
- spin_unlock_irqrestore(&conn->c_lock, flags);
- conn = rm->m_inc.i_conn;
- spin_lock_irqsave(&conn->c_lock, flags);
}
+ list_del_init(&rm->m_conn_item);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
- if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
- list_del_init(&rm->m_conn_item);
- rds_message_put(rm);
- }
- }
+ /*
+ * Couldn't grab m_rs_lock in top loop (lock ordering),
+ * but we can now.
+ */
+ spin_lock_irqsave(&rm->m_rs_lock, flags);
- if (conn)
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_lock(&rs->rs_lock);
+ __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ spin_unlock(&rs->rs_lock);
- if (wake)
- rds_wake_sk_sleep(rs);
+ rm->m_rs = NULL;
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+ rds_message_put(rm);
+ }
+
+ rds_wake_sk_sleep(rs);
while (!list_empty(&list)) {
rm = list_entry(list.next, struct rds_message, m_sock_item);
@@ -763,6 +809,63 @@ out:
return *queued;
}
+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+ struct cmsghdr *cmsg;
+ int size = 0;
+ int cmsg_groups = 0;
+ int retval;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_RDMA_ARGS:
+ cmsg_groups |= 1;
+ retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+ if (retval < 0)
+ return retval;
+ size += retval;
+
+ break;
+
+ case RDS_CMSG_RDMA_DEST:
+ case RDS_CMSG_RDMA_MAP:
+ cmsg_groups |= 2;
+ /* these are valid but do no add any size */
+ break;
+
+ case RDS_CMSG_ATOMIC_CSWP:
+ case RDS_CMSG_ATOMIC_FADD:
+ case RDS_CMSG_MASKED_ATOMIC_CSWP:
+ case RDS_CMSG_MASKED_ATOMIC_FADD:
+ cmsg_groups |= 1;
+ size += sizeof(struct scatterlist);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ }
+
+ size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+
+ /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+ if (cmsg_groups == 3)
+ return -EINVAL;
+
+ return size;
+}
+
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
struct msghdr *msg, int *allocated_mr)
{
@@ -777,7 +880,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
continue;
/* As a side effect, RDMA_DEST and RDMA_MAP will set
- * rm->m_rdma_cookie and rm->m_rdma_mr.
+ * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
*/
switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS:
@@ -793,6 +896,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
if (!ret)
*allocated_mr = 1;
break;
+ case RDS_CMSG_ATOMIC_CSWP:
+ case RDS_CMSG_ATOMIC_FADD:
+ case RDS_CMSG_MASKED_ATOMIC_CSWP:
+ case RDS_CMSG_MASKED_ATOMIC_FADD:
+ ret = rds_cmsg_atomic(rs, rm, cmsg);
+ break;
default:
return -EINVAL;
@@ -850,13 +959,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
goto out;
}
- rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
- if (IS_ERR(rm)) {
- ret = PTR_ERR(rm);
- rm = NULL;
+ /* size of rm including all sgs */
+ ret = rds_rm_size(msg, payload_len);
+ if (ret < 0)
+ goto out;
+
+ rm = rds_message_alloc(ret, GFP_KERNEL);
+ if (!rm) {
+ ret = -ENOMEM;
goto out;
}
+ /* Attach data to the rm */
+ if (payload_len) {
+ rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+ ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+ if (ret)
+ goto out;
+ }
+ rm->data.op_active = 1;
+
rm->m_daddr = daddr;
/* rds_conn_create has a spinlock that runs with IRQ off.
@@ -879,22 +1001,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
if (ret)
goto out;
- if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
- conn->c_trans->xmit_rdma == NULL) {
+ if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
if (printk_ratelimit())
printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
- rm->m_rdma_op, conn->c_trans->xmit_rdma);
+ &rm->rdma, conn->c_trans->xmit_rdma);
ret = -EOPNOTSUPP;
goto out;
}
- /* If the connection is down, trigger a connect. We may
- * have scheduled a delayed reconnect however - in this case
- * we should not interfere.
- */
- if (rds_conn_state(conn) == RDS_CONN_DOWN &&
- !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+ &rm->atomic, conn->c_trans->xmit_atomic);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rds_conn_connect_if_down(conn);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
if (ret) {
@@ -938,7 +1061,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
rds_stats_inc(s_send_queued);
if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- rds_send_worker(&conn->c_send_w.work);
+ rds_send_xmit(conn);
rds_message_put(rm);
return payload_len;
@@ -966,20 +1089,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
int ret = 0;
rm = rds_message_alloc(0, GFP_ATOMIC);
- if (rm == NULL) {
+ if (!rm) {
ret = -ENOMEM;
goto out;
}
rm->m_daddr = conn->c_faddr;
+ rm->data.op_active = 1;
- /* If the connection is down, trigger a connect. We may
- * have scheduled a delayed reconnect however - in this case
- * we should not interfere.
- */
- if (rds_conn_state(conn) == RDS_CONN_DOWN &&
- !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ rds_conn_connect_if_down(conn);
ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
if (ret)
@@ -999,7 +1117,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ rds_send_xmit(conn);
+
rds_message_put(rm);
return 0;
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 7598eb07cfb..10c759ccac0 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
"recv_ping",
"send_queue_empty",
"send_queue_full",
- "send_sem_contention",
- "send_sem_queue_raced",
+ "send_lock_contention",
+ "send_lock_queue_raced",
"send_immediate_retry",
"send_delayed_retry",
"send_drop_acked",
@@ -143,7 +143,7 @@ void rds_stats_exit(void)
rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
}
-int __init rds_stats_init(void)
+int rds_stats_init(void)
{
rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
return 0;
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 7829a20325d..25ad0c77a26 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -105,13 +105,13 @@ void rds_sysctl_exit(void)
unregister_sysctl_table(rds_sysctl_reg_table);
}
-int __init rds_sysctl_init(void)
+int rds_sysctl_init(void)
{
rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
- if (rds_sysctl_reg_table == NULL)
+ if (!rds_sysctl_reg_table)
return -ENOMEM;
return 0;
}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index babf4577ff7..08a8c6cf2d1 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -41,7 +41,7 @@
/* only for info exporting */
static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
static LIST_HEAD(rds_tcp_tc_list);
-unsigned int rds_tcp_tc_count;
+static unsigned int rds_tcp_tc_count;
/* Track rds_tcp_connection structs so they can be cleaned up */
static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
struct rds_tcp_connection *tc;
tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
- if (tc == NULL)
+ if (!tc)
return -ENOMEM;
tc->t_sock = NULL;
@@ -243,7 +243,7 @@ static void rds_tcp_destroy_conns(void)
}
}
-void rds_tcp_exit(void)
+static void rds_tcp_exit(void)
{
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
rds_tcp_listen_stop();
@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
.xmit_prepare = rds_tcp_xmit_prepare,
.xmit_complete = rds_tcp_xmit_complete,
- .xmit_cong_map = rds_tcp_xmit_cong_map,
.xmit = rds_tcp_xmit,
.recv = rds_tcp_recv,
.conn_alloc = rds_tcp_conn_alloc,
@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = {
.conn_connect = rds_tcp_conn_connect,
.conn_shutdown = rds_tcp_conn_shutdown,
.inc_copy_to_user = rds_tcp_inc_copy_to_user,
- .inc_purge = rds_tcp_inc_purge,
.inc_free = rds_tcp_inc_free,
.stats_info_copy = rds_tcp_stats_info_copy,
.exit = rds_tcp_exit,
@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback = 1,
};
-int __init rds_tcp_init(void)
+static int rds_tcp_init(void)
{
int ret;
rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
sizeof(struct rds_tcp_connection),
0, 0, NULL);
- if (rds_tcp_conn_slab == NULL) {
+ if (!rds_tcp_conn_slab) {
ret = -ENOMEM;
goto out;
}
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 844fa6b9cf5..9cf2927d002 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -43,8 +43,6 @@ struct rds_tcp_statistics {
};
/* tcp.c */
-int __init rds_tcp_init(void);
-void rds_tcp_exit(void);
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
@@ -61,16 +59,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
-int __init rds_tcp_listen_init(void);
+int rds_tcp_listen_init(void);
void rds_tcp_listen_stop(void);
void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
/* tcp_recv.c */
-int __init rds_tcp_recv_init(void);
+int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void);
void rds_tcp_data_ready(struct sock *sk, int bytes);
int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_purge(struct rds_incoming *inc);
void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size);
@@ -81,8 +78,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk);
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
- struct rds_cong_map *map, unsigned long offset);
/* tcp_stats.c */
DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index c397524c039..af95c8e058f 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -43,9 +43,9 @@ void rds_tcp_state_change(struct sock *sk)
struct rds_connection *conn;
struct rds_tcp_connection *tc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
- if (conn == NULL) {
+ if (!conn) {
state_change = sk->sk_state_change;
goto out;
}
@@ -68,7 +68,7 @@ void rds_tcp_state_change(struct sock *sk)
break;
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
state_change(sk);
}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 975183fe695..8b5cc4aa886 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -114,9 +114,9 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
rdsdebug("listen data ready sk %p\n", sk);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
ready = sk->sk_user_data;
- if (ready == NULL) { /* check for teardown race */
+ if (!ready) { /* check for teardown race */
ready = sk->sk_data_ready;
goto out;
}
@@ -131,11 +131,11 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
queue_work(rds_wq, &rds_tcp_listen_work);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk, bytes);
}
-int __init rds_tcp_listen_init(void)
+int rds_tcp_listen_init(void)
{
struct sockaddr_in sin;
struct socket *sock = NULL;
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
struct socket *sock = rds_tcp_listen_sock;
struct sock *sk;
- if (sock == NULL)
+ if (!sock)
return;
sk = sock->sk;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 1aba6878fa5..78205e25500 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -39,7 +39,7 @@
static struct kmem_cache *rds_tcp_incoming_slab;
-void rds_tcp_inc_purge(struct rds_incoming *inc)
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
{
struct rds_tcp_incoming *tinc;
tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
* processing.
*/
while (left) {
- if (tinc == NULL) {
+ if (!tinc) {
tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
arg->gfp);
- if (tinc == NULL) {
+ if (!tinc) {
desc->error = -ENOMEM;
goto out;
}
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
if (left && tc->t_tinc_data_rem) {
clone = skb_clone(skb, arg->gfp);
- if (clone == NULL) {
+ if (!clone) {
desc->error = -ENOMEM;
goto out;
}
@@ -272,7 +272,8 @@ out:
}
/* the caller has to hold the sock lock */
-int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km)
+static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp,
+ enum km_type km)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
struct socket *sock = tc->t_sock;
@@ -324,9 +325,9 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
- if (conn == NULL) { /* check for teardown race */
+ if (!conn) { /* check for teardown race */
ready = sk->sk_data_ready;
goto out;
}
@@ -338,16 +339,16 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
ready(sk, bytes);
}
-int __init rds_tcp_recv_init(void)
+int rds_tcp_recv_init(void)
{
rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
sizeof(struct rds_tcp_incoming),
0, 0, NULL);
- if (rds_tcp_incoming_slab == NULL)
+ if (!rds_tcp_incoming_slab)
return -ENOMEM;
return 0;
}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index a28b895ff0d..1b4fd68f0c7 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -63,7 +63,7 @@ void rds_tcp_xmit_complete(struct rds_connection *conn)
}
/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
+static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
{
struct kvec vec = {
.iov_base = data,
@@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
}
/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
- struct rds_cong_map *map, unsigned long offset)
-{
- static struct rds_header rds_tcp_map_header = {
- .h_flags = RDS_FLAG_CONG_BITMAP,
- };
- struct rds_tcp_connection *tc = conn->c_transport_data;
- unsigned long i;
- int ret;
- int copied = 0;
-
- /* Some problem claims cpu_to_be32(constant) isn't a constant. */
- rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
-
- if (offset < sizeof(struct rds_header)) {
- ret = rds_tcp_sendmsg(tc->t_sock,
- (void *)&rds_tcp_map_header + offset,
- sizeof(struct rds_header) - offset);
- if (ret <= 0)
- return ret;
- offset += ret;
- copied = ret;
- if (offset < sizeof(struct rds_header))
- return ret;
- }
-
- offset -= sizeof(struct rds_header);
- i = offset / PAGE_SIZE;
- offset = offset % PAGE_SIZE;
- BUG_ON(i >= RDS_CONG_MAP_PAGES);
-
- do {
- ret = tc->t_sock->ops->sendpage(tc->t_sock,
- virt_to_page(map->m_page_addrs[i]),
- offset, PAGE_SIZE - offset,
- MSG_DONTWAIT);
- if (ret <= 0)
- break;
- copied += ret;
- offset += ret;
- if (offset == PAGE_SIZE) {
- offset = 0;
- i++;
- }
- } while (i < RDS_CONG_MAP_PAGES);
-
- return copied ? copied : ret;
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off)
{
@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
goto out;
}
- while (sg < rm->m_nents) {
+ while (sg < rm->data.op_nents) {
ret = tc->t_sock->ops->sendpage(tc->t_sock,
- sg_page(&rm->m_sg[sg]),
- rm->m_sg[sg].offset + off,
- rm->m_sg[sg].length - off,
+ sg_page(&rm->data.op_sg[sg]),
+ rm->data.op_sg[sg].offset + off,
+ rm->data.op_sg[sg].length - off,
MSG_DONTWAIT|MSG_NOSIGNAL);
- rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]),
- rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off,
+ rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+ rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
ret);
if (ret <= 0)
break;
off += ret;
done += ret;
- if (off == rm->m_sg[sg].length) {
+ if (off == rm->data.op_sg[sg].length) {
off = 0;
sg++;
}
@@ -224,9 +174,9 @@ void rds_tcp_write_space(struct sock *sk)
struct rds_connection *conn;
struct rds_tcp_connection *tc;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
conn = sk->sk_user_data;
- if (conn == NULL) {
+ if (!conn) {
write_space = sk->sk_write_space;
goto out;
}
@@ -244,7 +194,7 @@ void rds_tcp_write_space(struct sock *sk)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
/*
* write_space is only called when data leaves tcp's send queue if
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 786c20eaaf5..0fd90f8c5f5 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -61,7 +61,7 @@
*
* Transition to state DISCONNECTING/DOWN:
* - Inside the shutdown worker; synchronizes with xmit path
- * through c_send_lock, and with connection management callbacks
+ * through RDS_IN_XMIT, and with connection management callbacks
* via c_cm_lock.
*
* For receive callbacks, we rely on the underlying transport
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
* We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established.
*/
-static void rds_queue_reconnect(struct rds_connection *conn)
+void rds_queue_reconnect(struct rds_connection *conn)
{
unsigned long rand;
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
}
}
-void rds_shutdown_worker(struct work_struct *work)
-{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
- /* shut it down unless it's down already */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
- /*
- * Quiesce the connection mgmt handlers before we start tearing
- * things down. We don't hold the mutex for the entire
- * duration of the shutdown operation, else we may be
- * deadlocking with the CM handler. Instead, the CM event
- * handler is supposed to check for state DISCONNECTING
- */
- mutex_lock(&conn->c_cm_lock);
- if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
- !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
- rds_conn_error(conn, "shutdown called in state %d\n",
- atomic_read(&conn->c_state));
- mutex_unlock(&conn->c_cm_lock);
- return;
- }
- mutex_unlock(&conn->c_cm_lock);
-
- mutex_lock(&conn->c_send_lock);
- conn->c_trans->conn_shutdown(conn);
- rds_conn_reset(conn);
- mutex_unlock(&conn->c_send_lock);
-
- if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
- /* This can happen - eg when we're in the middle of tearing
- * down the connection, and someone unloads the rds module.
- * Quite reproduceable with loopback connections.
- * Mostly harmless.
- */
- rds_conn_error(conn,
- "%s: failed to transition to state DOWN, "
- "current state is %d\n",
- __func__,
- atomic_read(&conn->c_state));
- return;
- }
- }
-
- /* Then reconnect if it's still live.
- * The passive side of an IB loopback connection is never added
- * to the conn hash, so we never trigger a reconnect on this
- * conn - the reconnect is always triggered by the active peer. */
- cancel_delayed_work(&conn->c_conn_w);
- if (!hlist_unhashed(&conn->c_hash_node))
- rds_queue_reconnect(conn);
-}
-
void rds_send_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work)
}
}
+void rds_shutdown_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+ rds_conn_shutdown(conn);
+}
+
void rds_threads_exit(void)
{
destroy_workqueue(rds_wq);
}
-int __init rds_threads_init(void)
+int rds_threads_init(void)
{
- rds_wq = create_workqueue("krdsd");
- if (rds_wq == NULL)
+ rds_wq = create_singlethread_workqueue("krdsd");
+ if (!rds_wq)
return -ENOMEM;
return 0;
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7e106790135..7f2ac4fec36 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
}
EXPORT_SYMBOL_GPL(rds_trans_unregister);
+void rds_trans_put(struct rds_transport *trans)
+{
+ if (trans && trans->t_owner)
+ module_put(trans->t_owner);
+}
+
struct rds_transport *rds_trans_get_preferred(__be32 addr)
{
struct rds_transport *ret = NULL;
- int i;
+ struct rds_transport *trans;
+ unsigned int i;
if (IN_LOOPBACK(ntohl(addr)))
return &rds_loop_transport;
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++)
- {
- if (transports[i] && (transports[i]->laddr_check(addr) == 0)) {
- ret = transports[i];
+ for (i = 0; i < RDS_TRANS_COUNT; i++) {
+ trans = transports[i];
+
+ if (trans && (trans->laddr_check(addr) == 0) &&
+ (!trans->t_owner || try_module_get(trans->t_owner))) {
+ ret = trans;
break;
}
}
diff --git a/net/rds/xlist.h b/net/rds/xlist.h
new file mode 100644
index 00000000000..e6b5190dadd
--- /dev/null
+++ b/net/rds/xlist.h
@@ -0,0 +1,80 @@
+#ifndef _LINUX_XLIST_H
+#define _LINUX_XLIST_H
+
+#include <linux/stddef.h>
+#include <linux/poison.h>
+#include <linux/prefetch.h>
+#include <asm/system.h>
+
+struct xlist_head {
+ struct xlist_head *next;
+};
+
+static inline void INIT_XLIST_HEAD(struct xlist_head *list)
+{
+ list->next = NULL;
+}
+
+static inline int xlist_empty(struct xlist_head *head)
+{
+ return head->next == NULL;
+}
+
+static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
+ struct xlist_head *head)
+{
+ struct xlist_head *cur;
+ struct xlist_head *check;
+
+ while (1) {
+ cur = head->next;
+ tail->next = cur;
+ check = cmpxchg(&head->next, cur, new);
+ if (check == cur)
+ break;
+ }
+}
+
+static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
+{
+ struct xlist_head *cur;
+ struct xlist_head *check;
+ struct xlist_head *next;
+
+ while (1) {
+ cur = head->next;
+ if (!cur)
+ goto out;
+
+ next = cur->next;
+ check = cmpxchg(&head->next, cur, next);
+ if (check == cur)
+ goto out;
+ }
+out:
+ return cur;
+}
+
+static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
+{
+ struct xlist_head *cur;
+
+ cur = head->next;
+ if (!cur)
+ return NULL;
+
+ head->next = cur->next;
+ return cur;
+}
+
+static inline void xlist_splice(struct xlist_head *list,
+ struct xlist_head *head)
+{
+ struct xlist_head *cur;
+
+ WARN_ON(head->next);
+ cur = xchg(&list->next, NULL);
+ head->next = cur;
+}
+
+#endif
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 51875a0c5d4..04f599089e6 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1241,6 +1241,7 @@ static const struct file_operations rfkill_fops = {
.unlocked_ioctl = rfkill_fop_ioctl,
.compat_ioctl = rfkill_fop_ioctl,
#endif
+ .llseek = no_llseek,
};
static struct miscdevice rfkill_miscdev = {
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
index 3713d7ecab9..1bca6d49ec9 100644
--- a/net/rfkill/input.c
+++ b/net/rfkill/input.c
@@ -142,7 +142,7 @@ static unsigned long rfkill_last_scheduled;
static unsigned long rfkill_ratelimit(const unsigned long last)
{
const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY);
- return (time_after(jiffies, last + delay)) ? 0 : delay;
+ return time_after(jiffies, last + delay) ? 0 : delay;
}
static void rfkill_schedule_ratelimited(void)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 8e45e76a95f..d952e7eac18 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -679,7 +679,7 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
return -EINVAL;
- if (addr->srose_ndigis > ROSE_MAX_DIGIS)
+ if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
return -EINVAL;
if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) {
@@ -739,7 +739,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
return -EINVAL;
- if (addr->srose_ndigis > ROSE_MAX_DIGIS)
+ if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
return -EINVAL;
/* Source + Destination digis should not exceed ROSE_MAX_DIGIS */
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index a750a28e022..fa5f5641a2c 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -114,7 +114,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh)
if (ax25s)
ax25_cb_put(ax25s);
- return (neigh->ax25 != NULL);
+ return neigh->ax25 != NULL;
}
/*
@@ -137,7 +137,7 @@ static int rose_link_up(struct rose_neigh *neigh)
if (ax25s)
ax25_cb_put(ax25s);
- return (neigh->ax25 != NULL);
+ return neigh->ax25 != NULL;
}
/*
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7043b294bb6..8e22bd345e7 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -597,12 +597,6 @@ extern unsigned rxrpc_debug;
#define dbgprintk(FMT,...) \
printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
#define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__)
@@ -655,11 +649,11 @@ do { \
} while (0)
#else
-#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__)
-#define _proto(FMT,...) _dbprintk("### "FMT ,##__VA_ARGS__)
-#define _net(FMT,...) _dbprintk("@@@ "FMT ,##__VA_ARGS__)
+#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__)
+#define _proto(FMT,...) no_printk("### "FMT ,##__VA_ARGS__)
+#define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__)
#endif
/*
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2f691fb180d..a36270a994d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -518,6 +518,16 @@ config NET_ACT_SKBEDIT
To compile this code as a module, choose M here: the
module will be called act_skbedit.
+config NET_ACT_CSUM
+ tristate "Checksum Updating"
+ depends on NET_CLS_ACT && INET
+ ---help---
+ Say Y here to update some common checksum after some direct
+ packet alterations.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_csum.
+
config NET_CLS_IND
bool "Incoming device classification"
depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f14e71bfa58..960f5dba630 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
+obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
new file mode 100644
index 00000000000..67dc7ce9b63
--- /dev/null
+++ b/net/sched/act_csum.c
@@ -0,0 +1,595 @@
+/*
+ * Checksum updating actions
+ *
+ * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include <linux/netlink.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/igmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+
+#include <net/act_api.h>
+
+#include <linux/tc_act/tc_csum.h>
+#include <net/tc_act/tc_csum.h>
+
+#define CSUM_TAB_MASK 15
+static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1];
+static u32 csum_idx_gen;
+static DEFINE_RWLOCK(csum_lock);
+
+static struct tcf_hashinfo csum_hash_info = {
+ .htab = tcf_csum_ht,
+ .hmask = CSUM_TAB_MASK,
+ .lock = &csum_lock,
+};
+
+static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
+ [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
+};
+
+static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
+ struct tc_action *a, int ovr, int bind)
+{
+ struct nlattr *tb[TCA_CSUM_MAX + 1];
+ struct tc_csum *parm;
+ struct tcf_common *pc;
+ struct tcf_csum *p;
+ int ret = 0, err;
+
+ if (nla == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CSUM_PARMS] == NULL)
+ return -EINVAL;
+ parm = nla_data(tb[TCA_CSUM_PARMS]);
+
+ pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info);
+ if (!pc) {
+ pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
+ &csum_idx_gen, &csum_hash_info);
+ if (IS_ERR(pc))
+ return PTR_ERR(pc);
+ p = to_tcf_csum(pc);
+ ret = ACT_P_CREATED;
+ } else {
+ p = to_tcf_csum(pc);
+ if (!ovr) {
+ tcf_hash_release(pc, bind, &csum_hash_info);
+ return -EEXIST;
+ }
+ }
+
+ spin_lock_bh(&p->tcf_lock);
+ p->tcf_action = parm->action;
+ p->update_flags = parm->update_flags;
+ spin_unlock_bh(&p->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(pc, &csum_hash_info);
+
+ return ret;
+}
+
+static int tcf_csum_cleanup(struct tc_action *a, int bind)
+{
+ struct tcf_csum *p = a->priv;
+ return tcf_hash_release(&p->common, bind, &csum_hash_info);
+}
+
+/**
+ * tcf_csum_skb_nextlayer - Get next layer pointer
+ * @skb: sk_buff to use
+ * @ihl: previous summed headers length
+ * @ipl: complete packet length
+ * @jhl: next header length
+ *
+ * Check the expected next layer availability in the specified sk_buff.
+ * Return the next layer pointer if pass, NULL otherwise.
+ */
+static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl,
+ unsigned int jhl)
+{
+ int ntkoff = skb_network_offset(skb);
+ int hl = ihl + jhl;
+
+ if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
+ (skb_cloned(skb) &&
+ !skb_clone_writable(skb, hl + ntkoff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ return NULL;
+ else
+ return (void *)(skb_network_header(skb) + ihl);
+}
+
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct icmphdr *icmph;
+
+ icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph));
+ if (icmph == NULL)
+ return 0;
+
+ icmph->checksum = 0;
+ skb->csum = csum_partial(icmph, ipl - ihl, 0);
+ icmph->checksum = csum_fold(skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct igmphdr *igmph;
+
+ igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph));
+ if (igmph == NULL)
+ return 0;
+
+ igmph->csum = 0;
+ skb->csum = csum_partial(igmph, ipl - ihl, 0);
+ igmph->csum = csum_fold(skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct icmp6hdr *icmp6h;
+
+ icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
+ if (icmp6h == NULL)
+ return 0;
+
+ icmp6h->icmp6_cksum = 0;
+ skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ipl - ihl, IPPROTO_ICMPV6,
+ skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct tcphdr *tcph;
+
+ tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+ if (tcph == NULL)
+ return 0;
+
+ tcph->check = 0;
+ skb->csum = csum_partial(tcph, ipl - ihl, 0);
+ tcph->check = tcp_v4_check(ipl - ihl,
+ iph->saddr, iph->daddr, skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int ihl, unsigned int ipl)
+{
+ struct tcphdr *tcph;
+
+ tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+ if (tcph == NULL)
+ return 0;
+
+ tcph->check = 0;
+ skb->csum = csum_partial(tcph, ipl - ihl, 0);
+ tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ ipl - ihl, IPPROTO_TCP,
+ skb->csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return 1;
+}
+
+static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int ihl, unsigned int ipl, int udplite)
+{
+ struct udphdr *udph;
+ u16 ul;
+
+ /*
+ * Support both UDP and UDPLITE checksum algorithms, Don't use
+ * udph->len to get the real length without any protocol check,
+ * UDPLITE uses udph->len for another thing,
+ * Use iph->tot_len, or just ipl.
+ */
+
+ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+ if (udph == NULL)
+ return 0;
+
+ ul = ntohs(udph->len);
+
+ if (udplite || udph->check) {
+
+ udph->check = 0;
+
+ if (udplite) {
+ if (ul == 0)
+ skb->csum = csum_partial(udph, ipl - ihl, 0);
+ else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+ skb->csum = csum_partial(udph, ul, 0);
+ else
+ goto ignore_obscure_skb;
+ } else {
+ if (ul != ipl - ihl)
+ goto ignore_obscure_skb;
+
+ skb->csum = csum_partial(udph, ul, 0);
+ }
+
+ udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ ul, iph->protocol,
+ skb->csum);
+
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+ return 1;
+}
+
+static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int ihl, unsigned int ipl, int udplite)
+{
+ struct udphdr *udph;
+ u16 ul;
+
+ /*
+ * Support both UDP and UDPLITE checksum algorithms, Don't use
+ * udph->len to get the real length without any protocol check,
+ * UDPLITE uses udph->len for another thing,
+ * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl.
+ */
+
+ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+ if (udph == NULL)
+ return 0;
+
+ ul = ntohs(udph->len);
+
+ udph->check = 0;
+
+ if (udplite) {
+ if (ul == 0)
+ skb->csum = csum_partial(udph, ipl - ihl, 0);
+
+ else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+ skb->csum = csum_partial(udph, ul, 0);
+
+ else
+ goto ignore_obscure_skb;
+ } else {
+ if (ul != ipl - ihl)
+ goto ignore_obscure_skb;
+
+ skb->csum = csum_partial(udph, ul, 0);
+ }
+
+ udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
+ udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
+ skb->csum);
+
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+ return 1;
+}
+
+static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
+{
+ struct iphdr *iph;
+ int ntkoff;
+
+ ntkoff = skb_network_offset(skb);
+
+ if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
+ goto fail;
+
+ iph = ip_hdr(skb);
+
+ switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
+ case IPPROTO_ICMP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+ if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_IGMP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
+ if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_TCP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+ if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4,
+ ntohs(iph->tot_len)))
+ goto fail;
+ break;
+ case IPPROTO_UDP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+ if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+ ntohs(iph->tot_len), 0))
+ goto fail;
+ break;
+ case IPPROTO_UDPLITE:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+ if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+ ntohs(iph->tot_len), 1))
+ goto fail;
+ break;
+ }
+
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
+ if (skb_cloned(skb) &&
+ !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ goto fail;
+
+ ip_send_check(iph);
+ }
+
+ return 1;
+
+fail:
+ return 0;
+}
+
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
+ unsigned int ixhl, unsigned int *pl)
+{
+ int off, len, optlen;
+ unsigned char *xh = (void *)ip6xh;
+
+ off = sizeof(*ip6xh);
+ len = ixhl - off;
+
+ while (len > 1) {
+ switch (xh[off]) {
+ case IPV6_TLV_PAD0:
+ optlen = 1;
+ break;
+ case IPV6_TLV_JUMBO:
+ optlen = xh[off + 1] + 2;
+ if (optlen != 6 || len < 6 || (off & 3) != 2)
+ /* wrong jumbo option length/alignment */
+ return 0;
+ *pl = ntohl(*(__be32 *)(xh + off + 2));
+ goto done;
+ default:
+ optlen = xh[off + 1] + 2;
+ if (optlen > len)
+ /* ignore obscure options */
+ goto done;
+ break;
+ }
+ off += optlen;
+ len -= optlen;
+ }
+
+done:
+ return 1;
+}
+
+static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
+{
+ struct ipv6hdr *ip6h;
+ struct ipv6_opt_hdr *ip6xh;
+ unsigned int hl, ixhl;
+ unsigned int pl;
+ int ntkoff;
+ u8 nexthdr;
+
+ ntkoff = skb_network_offset(skb);
+
+ hl = sizeof(*ip6h);
+
+ if (!pskb_may_pull(skb, hl + ntkoff))
+ goto fail;
+
+ ip6h = ipv6_hdr(skb);
+
+ pl = ntohs(ip6h->payload_len);
+ nexthdr = ip6h->nexthdr;
+
+ do {
+ switch (nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ goto ignore_skb;
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_HOP:
+ case NEXTHDR_DEST:
+ if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
+ goto fail;
+ ip6xh = (void *)(skb_network_header(skb) + hl);
+ ixhl = ipv6_optlen(ip6xh);
+ if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
+ goto fail;
+ if ((nexthdr == NEXTHDR_HOP) &&
+ !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
+ goto fail;
+ nexthdr = ip6xh->nexthdr;
+ hl += ixhl;
+ break;
+ case IPPROTO_ICMPV6:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+ if (!tcf_csum_ipv6_icmp(skb, ip6h,
+ hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
+ case IPPROTO_TCP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+ if (!tcf_csum_ipv6_tcp(skb, ip6h,
+ hl, pl + sizeof(*ip6h)))
+ goto fail;
+ goto done;
+ case IPPROTO_UDP:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+ if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+ pl + sizeof(*ip6h), 0))
+ goto fail;
+ goto done;
+ case IPPROTO_UDPLITE:
+ if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+ if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+ pl + sizeof(*ip6h), 1))
+ goto fail;
+ goto done;
+ default:
+ goto ignore_skb;
+ }
+ } while (pskb_may_pull(skb, hl + 1 + ntkoff));
+
+done:
+ignore_skb:
+ return 1;
+
+fail:
+ return 0;
+}
+
+static int tcf_csum(struct sk_buff *skb,
+ struct tc_action *a, struct tcf_result *res)
+{
+ struct tcf_csum *p = a->priv;
+ int action;
+ u32 update_flags;
+
+ spin_lock(&p->tcf_lock);
+ p->tcf_tm.lastuse = jiffies;
+ p->tcf_bstats.bytes += qdisc_pkt_len(skb);
+ p->tcf_bstats.packets++;
+ action = p->tcf_action;
+ update_flags = p->update_flags;
+ spin_unlock(&p->tcf_lock);
+
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
+
+ switch (skb->protocol) {
+ case cpu_to_be16(ETH_P_IP):
+ if (!tcf_csum_ipv4(skb, update_flags))
+ goto drop;
+ break;
+ case cpu_to_be16(ETH_P_IPV6):
+ if (!tcf_csum_ipv6(skb, update_flags))
+ goto drop;
+ break;
+ }
+
+ return action;
+
+drop:
+ spin_lock(&p->tcf_lock);
+ p->tcf_qstats.drops++;
+ spin_unlock(&p->tcf_lock);
+ return TC_ACT_SHOT;
+}
+
+static int tcf_csum_dump(struct sk_buff *skb,
+ struct tc_action *a, int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_csum *p = a->priv;
+ struct tc_csum opt = {
+ .update_flags = p->update_flags,
+ .index = p->tcf_index,
+ .action = p->tcf_action,
+ .refcnt = p->tcf_refcnt - ref,
+ .bindcnt = p->tcf_bindcnt - bind,
+ };
+ struct tcf_t t;
+
+ NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt);
+ t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+ NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t);
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static struct tc_action_ops act_csum_ops = {
+ .kind = "csum",
+ .hinfo = &csum_hash_info,
+ .type = TCA_ACT_CSUM,
+ .capab = TCA_CAP_NONE,
+ .owner = THIS_MODULE,
+ .act = tcf_csum,
+ .dump = tcf_csum_dump,
+ .cleanup = tcf_csum_cleanup,
+ .lookup = tcf_hash_search,
+ .init = tcf_csum_init,
+ .walk = tcf_generic_walker
+};
+
+MODULE_DESCRIPTION("Checksum updating actions");
+MODULE_LICENSE("GPL");
+
+static int __init csum_init_module(void)
+{
+ return tcf_register_action(&act_csum_ops);
+}
+
+static void __exit csum_cleanup_module(void)
+{
+ tcf_unregister_action(&act_csum_ops);
+}
+
+module_init(csum_init_module);
+module_exit(csum_cleanup_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 8406c665499..c2ed90a4c0b 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -152,21 +152,24 @@ static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result
static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
- struct tc_gact opt;
struct tcf_gact *gact = a->priv;
+ struct tc_gact opt = {
+ .index = gact->tcf_index,
+ .refcnt = gact->tcf_refcnt - ref,
+ .bindcnt = gact->tcf_bindcnt - bind,
+ .action = gact->tcf_action,
+ };
struct tcf_t t;
- opt.index = gact->tcf_index;
- opt.refcnt = gact->tcf_refcnt - ref;
- opt.bindcnt = gact->tcf_bindcnt - bind;
- opt.action = gact->tcf_action;
NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
#ifdef CONFIG_GACT_PROB
if (gact->tcfg_ptype) {
- struct tc_gact_p p_opt;
- p_opt.paction = gact->tcfg_paction;
- p_opt.pval = gact->tcfg_pval;
- p_opt.ptype = gact->tcfg_ptype;
+ struct tc_gact_p p_opt = {
+ .paction = gact->tcfg_paction,
+ .pval = gact->tcfg_pval,
+ .ptype = gact->tcfg_ptype,
+ };
+
NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
}
#endif
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index c7e59e6ec34..8daef963225 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -39,7 +39,7 @@ static struct tcf_hashinfo ipt_hash_info = {
.lock = &ipt_lock,
};
-static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
+static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
{
struct xt_tgchk_param par;
struct xt_target *target;
@@ -66,7 +66,7 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
return 0;
}
-static void ipt_destroy_target(struct ipt_entry_target *t)
+static void ipt_destroy_target(struct xt_entry_target *t)
{
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
@@ -99,7 +99,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
[TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
[TCA_IPT_HOOK] = { .type = NLA_U32 },
[TCA_IPT_INDEX] = { .type = NLA_U32 },
- [TCA_IPT_TARG] = { .len = sizeof(struct ipt_entry_target) },
+ [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
};
static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
@@ -108,7 +108,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
struct nlattr *tb[TCA_IPT_MAX + 1];
struct tcf_ipt *ipt;
struct tcf_common *pc;
- struct ipt_entry_target *td, *t;
+ struct xt_entry_target *td, *t;
char *tname;
int ret = 0, err;
u32 hook = 0;
@@ -126,7 +126,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
if (tb[TCA_IPT_TARG] == NULL)
return -EINVAL;
- td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
+ td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
return -EINVAL;
@@ -230,7 +230,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
result = TC_ACT_SHOT;
ipt->tcf_qstats.drops++;
break;
- case IPT_CONTINUE:
+ case XT_CONTINUE:
result = TC_ACT_PIPE;
break;
default:
@@ -249,7 +249,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_ipt *ipt = a->priv;
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct tcf_t tm;
struct tc_cnt c;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 11f195af2da..0c311be9282 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -219,15 +219,16 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = a->priv;
- struct tc_mirred opt;
+ struct tc_mirred opt = {
+ .index = m->tcf_index,
+ .action = m->tcf_action,
+ .refcnt = m->tcf_refcnt - ref,
+ .bindcnt = m->tcf_bindcnt - bind,
+ .eaction = m->tcfm_eaction,
+ .ifindex = m->tcfm_ifindex,
+ };
struct tcf_t t;
- opt.index = m->tcf_index;
- opt.action = m->tcf_action;
- opt.refcnt = m->tcf_refcnt - ref;
- opt.bindcnt = m->tcf_bindcnt - bind;
- opt.eaction = m->tcfm_eaction;
- opt.ifindex = m->tcfm_ifindex;
NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 509a2d53a99..186eb837e60 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -272,19 +272,19 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_nat *p = a->priv;
- struct tc_nat opt;
+ struct tc_nat opt = {
+ .old_addr = p->old_addr,
+ .new_addr = p->new_addr,
+ .mask = p->mask,
+ .flags = p->flags,
+
+ .index = p->tcf_index,
+ .action = p->tcf_action,
+ .refcnt = p->tcf_refcnt - ref,
+ .bindcnt = p->tcf_bindcnt - bind,
+ };
struct tcf_t t;
- opt.old_addr = p->old_addr;
- opt.new_addr = p->new_addr;
- opt.mask = p->mask;
- opt.flags = p->flags;
-
- opt.index = p->tcf_index;
- opt.action = p->tcf_action;
- opt.refcnt = p->tcf_refcnt - ref;
- opt.bindcnt = p->tcf_bindcnt - bind;
-
NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt);
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 537a48732e9..7ebf7439b47 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -350,22 +350,19 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_police *police = a->priv;
- struct tc_police opt;
-
- opt.index = police->tcf_index;
- opt.action = police->tcf_action;
- opt.mtu = police->tcfp_mtu;
- opt.burst = police->tcfp_burst;
- opt.refcnt = police->tcf_refcnt - ref;
- opt.bindcnt = police->tcf_bindcnt - bind;
+ struct tc_police opt = {
+ .index = police->tcf_index,
+ .action = police->tcf_action,
+ .mtu = police->tcfp_mtu,
+ .burst = police->tcfp_burst,
+ .refcnt = police->tcf_refcnt - ref,
+ .bindcnt = police->tcf_bindcnt - bind,
+ };
+
if (police->tcfp_R_tab)
opt.rate = police->tcfp_R_tab->rate;
- else
- memset(&opt.rate, 0, sizeof(opt.rate));
if (police->tcfp_P_tab)
opt.peakrate = police->tcfp_P_tab->rate;
- else
- memset(&opt.peakrate, 0, sizeof(opt.peakrate));
NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
if (police->tcfp_result)
NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 4a1d640b0cf..97e84f3ee77 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -164,13 +164,14 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_defact *d = a->priv;
- struct tc_defact opt;
+ struct tc_defact opt = {
+ .index = d->tcf_index,
+ .refcnt = d->tcf_refcnt - ref,
+ .bindcnt = d->tcf_bindcnt - bind,
+ .action = d->tcf_action,
+ };
struct tcf_t t;
- opt.index = d->tcf_index;
- opt.refcnt = d->tcf_refcnt - ref;
- opt.bindcnt = d->tcf_bindcnt - bind;
- opt.action = d->tcf_action;
NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index e9607fe55b5..66cbf4eb885 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -159,13 +159,14 @@ static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = a->priv;
- struct tc_skbedit opt;
+ struct tc_skbedit opt = {
+ .index = d->tcf_index,
+ .refcnt = d->tcf_refcnt - ref,
+ .bindcnt = d->tcf_bindcnt - bind,
+ .action = d->tcf_action,
+ };
struct tcf_t t;
- opt.index = d->tcf_index;
- opt.refcnt = d->tcf_refcnt - ref;
- opt.bindcnt = d->tcf_bindcnt - bind;
- opt.action = d->tcf_action;
NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
if (d->flags & SKBEDIT_F_PRIORITY)
NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 78ef2c5e130..37dff78e9cb 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
* calls by looking at the number of nested bh disable calls because
* softirqs always disables bh.
*/
- if (softirq_count() != SOFTIRQ_OFFSET) {
+ if (in_serving_softirq()) {
/* If there is an sk_classid we'll use that. */
if (!skb->sk)
return -1;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index e17096e3913..5b271a18bc3 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -111,44 +111,41 @@ static u32 flow_get_proto(struct sk_buff *skb)
}
}
-static int has_ports(u8 protocol)
-{
- switch (protocol) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- case IPPROTO_ESP:
- return 1;
- default:
- return 0;
- }
-}
-
static u32 flow_get_proto_src(struct sk_buff *skb)
{
switch (skb->protocol) {
case htons(ETH_P_IP): {
struct iphdr *iph;
+ int poff;
if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ip_hdr(skb);
- if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
- has_ports(iph->protocol) &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 2))
- return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4));
+ if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ break;
+ poff = proto_ports_offset(iph->protocol);
+ if (poff >= 0 &&
+ pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) {
+ iph = ip_hdr(skb);
+ return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
+ poff));
+ }
break;
}
case htons(ETH_P_IPV6): {
struct ipv6hdr *iph;
+ int poff;
- if (!pskb_network_may_pull(skb, sizeof(*iph) + 2))
+ if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ipv6_hdr(skb);
- if (has_ports(iph->nexthdr))
- return ntohs(*(__be16 *)&iph[1]);
+ poff = proto_ports_offset(iph->nexthdr);
+ if (poff >= 0 &&
+ pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) {
+ iph = ipv6_hdr(skb);
+ return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
+ poff));
+ }
break;
}
}
@@ -161,24 +158,36 @@ static u32 flow_get_proto_dst(struct sk_buff *skb)
switch (skb->protocol) {
case htons(ETH_P_IP): {
struct iphdr *iph;
+ int poff;
if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ip_hdr(skb);
- if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
- has_ports(iph->protocol) &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 4))
- return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2));
+ if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ break;
+ poff = proto_ports_offset(iph->protocol);
+ if (poff >= 0 &&
+ pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
+ iph = ip_hdr(skb);
+ return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
+ 2 + poff));
+ }
break;
}
case htons(ETH_P_IPV6): {
struct ipv6hdr *iph;
+ int poff;
- if (!pskb_network_may_pull(skb, sizeof(*iph) + 4))
+ if (!pskb_network_may_pull(skb, sizeof(*iph)))
break;
iph = ipv6_hdr(skb);
- if (has_ports(iph->nexthdr))
- return ntohs(*(__be16 *)((void *)&iph[1] + 2));
+ poff = proto_ports_offset(iph->nexthdr);
+ if (poff >= 0 &&
+ pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) {
+ iph = ipv6_hdr(skb);
+ return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
+ poff + 2));
+ }
break;
}
}
@@ -297,6 +306,11 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)
return tag & VLAN_VID_MASK;
}
+static u32 flow_get_rxhash(struct sk_buff *skb)
+{
+ return skb_get_rxhash(skb);
+}
+
static u32 flow_key_get(struct sk_buff *skb, int key)
{
switch (key) {
@@ -334,6 +348,8 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
return flow_get_skgid(skb);
case FLOW_KEY_VLAN_TAG:
return flow_get_vlan_tag(skb);
+ case FLOW_KEY_RXHASH:
+ return flow_get_rxhash(skb);
default:
WARN_ON(1);
return 0;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 7416a5c73b2..b0c2a82178a 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -137,7 +137,7 @@ next_knode:
int toff = off + key->off + (off2 & key->offmask);
__be32 *data, _data;
- if (skb_headroom(skb) + toff < 0)
+ if (skb_headroom(skb) + toff > INT_MAX)
goto out;
data = skb_header_pointer(skb, toff, 4, &_data);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 3bcac8aa333..34da5e29ea1 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -223,6 +223,11 @@ META_COLLECTOR(int_maclen)
dst->value = skb->mac_len;
}
+META_COLLECTOR(int_rxhash)
+{
+ dst->value = skb_get_rxhash(skb);
+}
+
/**************************************************************************
* Netfilter
**************************************************************************/
@@ -541,6 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
[META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
[META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
+ [META_ID(RXHASH)] = META_FUNC(int_rxhash),
}
};
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b9e8c3b7d40..b22ca2d1ceb 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -150,22 +150,34 @@ int register_qdisc(struct Qdisc_ops *qops)
if (qops->enqueue == NULL)
qops->enqueue = noop_qdisc_ops.enqueue;
if (qops->peek == NULL) {
- if (qops->dequeue == NULL) {
+ if (qops->dequeue == NULL)
qops->peek = noop_qdisc_ops.peek;
- } else {
- rc = -EINVAL;
- goto out;
- }
+ else
+ goto out_einval;
}
if (qops->dequeue == NULL)
qops->dequeue = noop_qdisc_ops.dequeue;
+ if (qops->cl_ops) {
+ const struct Qdisc_class_ops *cops = qops->cl_ops;
+
+ if (!(cops->get && cops->put && cops->walk && cops->leaf))
+ goto out_einval;
+
+ if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
+ goto out_einval;
+ }
+
qops->next = NULL;
*qp = qops;
rc = 0;
out:
write_unlock(&qdisc_mod_lock);
return rc;
+
+out_einval:
+ rc = -EINVAL;
+ goto out;
}
EXPORT_SYMBOL(register_qdisc);
@@ -228,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
if (q)
goto out;
- q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
+ if (dev_ingress_queue(dev))
+ q = qdisc_match_from_root(
+ dev_ingress_queue(dev)->qdisc_sleeping,
+ handle);
out:
return q;
}
@@ -348,7 +363,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
}
- if (!s || tsize != s->tsize || (!tab && tsize > 0))
+ if (tsize != s->tsize || (!tab && tsize > 0))
return ERR_PTR(-EINVAL);
spin_lock(&qdisc_stab_lock);
@@ -678,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
(new && new->flags & TCQ_F_INGRESS)) {
num_q = 1;
ingress = 1;
+ if (!dev_ingress_queue(dev))
+ return -ENOENT;
}
if (dev->flags & IFF_UP)
@@ -689,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
}
for (i = 0; i < num_q; i++) {
- struct netdev_queue *dev_queue = &dev->rx_queue;
+ struct netdev_queue *dev_queue = dev_ingress_queue(dev);
if (!ingress)
dev_queue = netdev_get_tx_queue(dev, i);
@@ -967,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
return -ENOENT;
q = qdisc_leaf(p, clid);
} else { /* ingress */
- q = dev->rx_queue.qdisc_sleeping;
+ if (dev_ingress_queue(dev))
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -1031,8 +1049,9 @@ replay:
if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
return -ENOENT;
q = qdisc_leaf(p, clid);
- } else { /*ingress */
- q = dev->rx_queue.qdisc_sleeping;
+ } else { /* ingress */
+ if (dev_ingress_queue_create(dev))
+ q = dev_ingress_queue(dev)->qdisc_sleeping;
}
} else {
q = dev->qdisc;
@@ -1111,11 +1130,14 @@ replay:
create_n_graft:
if (!(n->nlmsg_flags&NLM_F_CREATE))
return -ENOENT;
- if (clid == TC_H_INGRESS)
- q = qdisc_create(dev, &dev->rx_queue, p,
- tcm->tcm_parent, tcm->tcm_parent,
- tca, &err);
- else {
+ if (clid == TC_H_INGRESS) {
+ if (dev_ingress_queue(dev))
+ q = qdisc_create(dev, dev_ingress_queue(dev), p,
+ tcm->tcm_parent, tcm->tcm_parent,
+ tca, &err);
+ else
+ err = -ENOENT;
+ } else {
struct netdev_queue *dev_queue;
if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
@@ -1292,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
goto done;
- dev_queue = &dev->rx_queue;
- if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+ dev_queue = dev_ingress_queue(dev);
+ if (dev_queue &&
+ tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+ &q_idx, s_q_idx) < 0)
goto done;
cont:
@@ -1583,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
goto done;
- dev_queue = &dev->rx_queue;
- if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+ dev_queue = dev_ingress_queue(dev);
+ if (dev_queue &&
+ tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+ &t, s_t) < 0)
goto done;
done:
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index e114f23d5ea..282540778aa 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -255,10 +255,6 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
error = -EINVAL;
goto err_out;
}
- if (!list_empty(&flow->list)) {
- error = -EEXIST;
- goto err_out;
- }
} else {
int i;
unsigned long cl;
@@ -279,8 +275,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
goto err_out;
}
flow->filter_list = NULL;
- flow->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops, classid);
+ flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
if (!flow->q)
flow->q = &noop_qdisc;
pr_debug("atm_tc_change: qdisc %p\n", flow->q);
@@ -418,7 +413,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
ret = qdisc_enqueue(skb, flow->q);
- if (ret != 0) {
+ if (ret != NET_XMIT_SUCCESS) {
drop: __maybe_unused
if (net_xmit_drop_count(ret)) {
sch->qstats.drops++;
@@ -442,7 +437,7 @@ drop: __maybe_unused
*/
if (flow == &p->link) {
sch->q.qlen++;
- return 0;
+ return NET_XMIT_SUCCESS;
}
tasklet_schedule(&p->task);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
@@ -547,7 +542,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
INIT_LIST_HEAD(&p->flows);
INIT_LIST_HEAD(&p->link.list);
list_add(&p->link.list, &p->flows);
- p->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+ p->link.q = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, sch->handle);
if (!p->link.q)
p->link.q = &noop_qdisc;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 28c01ef5abc..eb763159086 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1379,9 +1379,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
q->link.sibling = &q->link;
q->link.common.classid = sch->handle;
q->link.qdisc = sch;
- if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops,
- sch->handle)))
+ q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle);
+ if (!q->link.q)
q->link.q = &noop_qdisc;
q->link.priority = TC_CBQ_MAXPRIO-1;
@@ -1623,7 +1623,7 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct cbq_class *cl = (struct cbq_class*)arg;
if (new == NULL) {
- new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+ new = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, cl->common.classid);
if (new == NULL)
return -ENOBUFS;
@@ -1874,8 +1874,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
cl->R_tab = rtab;
rtab = NULL;
cl->refcnt = 1;
- if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops, classid)))
+ cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+ if (!cl->q)
cl->q = &noop_qdisc;
cl->common.classid = classid;
cl->tparent = parent;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index b74046a9539..aa8b5313f8c 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -110,7 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
cl->refcnt = 1;
cl->common.classid = classid;
cl->quantum = quantum;
- cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
if (cl->qdisc == NULL)
cl->qdisc = &noop_qdisc;
@@ -218,7 +218,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
struct drr_class *cl = (struct drr_class *)arg;
if (new == NULL) {
- new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+ new = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, cl->common.classid);
if (new == NULL)
new = &noop_qdisc;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 63d41f86679..1d295d62bb5 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -61,8 +61,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
sch, p, new, old);
if (new == NULL) {
- new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops,
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
sch->handle);
if (new == NULL)
new = &noop_qdisc;
@@ -384,8 +383,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
p->default_index = default_index;
p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
- p->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops, sch->handle);
+ p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
if (p->q == NULL)
p->q = &noop_qdisc;
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 5948bafa8ce..4dfecb0cba3 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -172,8 +172,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
struct Qdisc *q;
int err = -ENOMEM;
- q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- ops, TC_H_MAKE(sch->handle, 1));
+ q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1));
if (q) {
err = fifo_set_limit(q, limit);
if (err < 0) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2aeb3a4386a..5dbb3cd96e5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -383,6 +383,7 @@ struct Qdisc noop_qdisc = {
.list = LIST_HEAD_INIT(noop_qdisc.list),
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
+ .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);
@@ -409,6 +410,7 @@ static struct Qdisc noqueue_qdisc = {
.list = LIST_HEAD_INIT(noqueue_qdisc.list),
.q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
.dev_queue = &noqueue_netdev_queue,
+ .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
};
@@ -574,10 +576,8 @@ errout:
return ERR_PTR(err);
}
-struct Qdisc * qdisc_create_dflt(struct net_device *dev,
- struct netdev_queue *dev_queue,
- struct Qdisc_ops *ops,
- unsigned int parentid)
+struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
+ struct Qdisc_ops *ops, unsigned int parentid)
{
struct Qdisc *sch;
@@ -682,7 +682,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
struct Qdisc *qdisc;
if (dev->tx_queue_len) {
- qdisc = qdisc_create_dflt(dev, dev_queue,
+ qdisc = qdisc_create_dflt(dev_queue,
&pfifo_fast_ops, TC_H_ROOT);
if (!qdisc) {
printk(KERN_INFO "%s: activation failed\n", dev->name);
@@ -709,7 +709,7 @@ static void attach_default_qdiscs(struct net_device *dev)
dev->qdisc = txq->qdisc_sleeping;
atomic_inc(&dev->qdisc->refcnt);
} else {
- qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
+ qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
if (qdisc) {
qdisc->ops->attach(qdisc);
dev->qdisc = qdisc;
@@ -753,7 +753,8 @@ void dev_activate(struct net_device *dev)
need_watchdog = 0;
netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
- transition_one_qdisc(dev, &dev->rx_queue, NULL);
+ if (dev_ingress_queue(dev))
+ transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
if (need_watchdog) {
dev->trans_start = jiffies;
@@ -812,7 +813,8 @@ static bool some_qdisc_is_busy(struct net_device *dev)
void dev_deactivate(struct net_device *dev)
{
netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
- dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
dev_watchdog_down(dev);
@@ -838,7 +840,8 @@ void dev_init_scheduler(struct net_device *dev)
{
dev->qdisc = &noop_qdisc;
netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
- dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}
@@ -861,7 +864,8 @@ static void shutdown_scheduler_queue(struct net_device *dev,
void dev_shutdown(struct net_device *dev)
{
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
- shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
+ if (dev_ingress_queue(dev))
+ shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
qdisc_destroy(dev->qdisc);
dev->qdisc = &noop_qdisc;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index abd904be428..069c62b7bb3 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -761,8 +761,8 @@ init_vf(struct hfsc_class *cl, unsigned int len)
if (f != cl->cl_f) {
cl->cl_f = f;
cftree_update(cl);
- update_cfmin(cl->cl_parent);
}
+ update_cfmin(cl->cl_parent);
}
}
@@ -1088,7 +1088,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
cl->refcnt = 1;
cl->sched = q;
cl->cl_parent = parent;
- cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+ cl->qdisc = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
if (cl->qdisc == NULL)
cl->qdisc = &noop_qdisc;
@@ -1209,8 +1209,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
if (cl->level > 0)
return -EINVAL;
if (new == NULL) {
- new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops,
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
cl->cl_common.classid);
if (new == NULL)
new = &noop_qdisc;
@@ -1452,8 +1451,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
q->root.cl_common.classid = sch->handle;
q->root.refcnt = 1;
q->root.sched = q;
- q->root.qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops,
+ q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
sch->handle);
if (q->root.qdisc == NULL)
q->root.qdisc = &noop_qdisc;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 4be8d04b262..01b519d6c52 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1121,8 +1121,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
if (cl->level)
return -EINVAL;
if (new == NULL &&
- (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops,
+ (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
cl->common.classid)) == NULL)
return -ENOBUFS;
@@ -1247,8 +1246,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
return -EBUSY;
if (!cl->level && htb_parent_last_child(cl)) {
- new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &pfifo_qdisc_ops,
+ new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
cl->parent->common.classid);
last_child = 1;
}
@@ -1302,14 +1300,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
struct htb_class *cl = (struct htb_class *)*arg, *parent;
struct nlattr *opt = tca[TCA_OPTIONS];
struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
- struct nlattr *tb[TCA_HTB_RTAB + 1];
+ struct nlattr *tb[__TCA_HTB_MAX];
struct tc_htb_opt *hopt;
/* extract all subattrs from opt attr */
if (!opt)
goto failure;
- err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
+ err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
if (err < 0)
goto failure;
@@ -1377,7 +1375,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
so that can't be used inside of sch_tree_lock
-- thanks to Karlis Peisenieks */
- new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+ new_q = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
sch_tree_lock(sch);
if (parent && !parent->level) {
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index fe91e50f9d9..ecc302f4d2a 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -56,7 +56,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
- qdisc = qdisc_create_dflt(dev, dev_queue, &pfifo_fast_ops,
+ qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 6ae251279fc..32690deab5d 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -227,8 +227,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
- child = qdisc_create_dflt(qdisc_dev(sch),
- sch->dev_queue,
+ child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle,
i + 1));
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 4714ff162bb..e5593c083a7 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -538,8 +538,7 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)
qdisc_watchdog_init(&q->watchdog, sch);
- q->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
- &tfifo_qdisc_ops,
+ q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
TC_H_MAKE(sch->handle, 1));
if (!q->qdisc) {
pr_debug("netem: qdisc create failed\n");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 0748fb1e3a4..b1c95bce33c 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -200,7 +200,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
for (i=0; i<q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
- child = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+ child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle, i + 1));
if (child) {
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 534f33231c1..3cf478d012d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -123,40 +123,39 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
case htons(ETH_P_IP):
{
const struct iphdr *iph;
+ int poff;
if (!pskb_network_may_pull(skb, sizeof(*iph)))
goto err;
iph = ip_hdr(skb);
h = (__force u32)iph->daddr;
h2 = (__force u32)iph->saddr ^ iph->protocol;
- if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
- (iph->protocol == IPPROTO_TCP ||
- iph->protocol == IPPROTO_UDP ||
- iph->protocol == IPPROTO_UDPLITE ||
- iph->protocol == IPPROTO_SCTP ||
- iph->protocol == IPPROTO_DCCP ||
- iph->protocol == IPPROTO_ESP) &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 4))
- h2 ^= *(((u32*)iph) + iph->ihl);
+ if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+ break;
+ poff = proto_ports_offset(iph->protocol);
+ if (poff >= 0 &&
+ pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
+ iph = ip_hdr(skb);
+ h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
+ }
break;
}
case htons(ETH_P_IPV6):
{
struct ipv6hdr *iph;
+ int poff;
if (!pskb_network_may_pull(skb, sizeof(*iph)))
goto err;
iph = ipv6_hdr(skb);
h = (__force u32)iph->daddr.s6_addr32[3];
h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
- if ((iph->nexthdr == IPPROTO_TCP ||
- iph->nexthdr == IPPROTO_UDP ||
- iph->nexthdr == IPPROTO_UDPLITE ||
- iph->nexthdr == IPPROTO_SCTP ||
- iph->nexthdr == IPPROTO_DCCP ||
- iph->nexthdr == IPPROTO_ESP) &&
- pskb_network_may_pull(skb, sizeof(*iph) + 4))
- h2 ^= *(u32*)&iph[1];
+ poff = proto_ports_offset(iph->nexthdr);
+ if (poff >= 0 &&
+ pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
+ iph = ipv6_hdr(skb);
+ h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
+ }
break;
}
default:
@@ -334,7 +333,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (++sch->q.qlen <= q->limit) {
sch->bstats.bytes += qdisc_pkt_len(skb);
sch->bstats.packets++;
- return 0;
+ return NET_XMIT_SUCCESS;
}
sfq_drop(sch);
@@ -508,6 +507,11 @@ nla_put_failure:
return -1;
}
+static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
return 0;
@@ -519,6 +523,10 @@ static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
return 0;
}
+static void sfq_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
struct sfq_sched_data *q = qdisc_priv(sch);
@@ -571,9 +579,12 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
}
static const struct Qdisc_class_ops sfq_class_ops = {
+ .leaf = sfq_leaf,
.get = sfq_get,
+ .put = sfq_put,
.tcf_chain = sfq_find_tcf,
.bind_tcf = sfq_bind,
+ .unbind_tcf = sfq_put,
.dump = sfq_dump_class,
.dump_stats = sfq_dump_class_stats,
.walk = sfq_walk,
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 0991c640cd3..641a30d6463 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -127,7 +127,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
return qdisc_reshape_fail(skb, sch);
ret = qdisc_enqueue(skb, q->qdisc);
- if (ret != 0) {
+ if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
sch->qstats.drops++;
return ret;
@@ -136,7 +136,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
sch->q.qlen++;
sch->bstats.bytes += qdisc_pkt_len(skb);
sch->bstats.packets++;
- return 0;
+ return NET_XMIT_SUCCESS;
}
static unsigned int tbf_drop(struct Qdisc* sch)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 807643bdcba..401af959670 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -85,7 +85,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
__skb_queue_tail(&q->q, skb);
sch->bstats.bytes += qdisc_pkt_len(skb);
sch->bstats.packets++;
- return 0;
+ return NET_XMIT_SUCCESS;
}
kfree_skb(skb);
@@ -241,11 +241,11 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *
}
if (neigh_event_send(n, skb_res) == 0) {
int err;
+ char haddr[MAX_ADDR_LEN];
- read_lock(&n->lock);
- err = dev_hard_header(skb, dev, ntohs(skb->protocol),
- n->ha, NULL, skb->len);
- read_unlock(&n->lock);
+ neigh_ha_snapshot(haddr, n, dev);
+ err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
+ NULL, skb->len);
if (err < 0) {
neigh_release(n);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 0b85e525643..5f1fb8bd862 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -48,6 +48,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 86366390038..ddbbf7c81fa 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -543,16 +543,20 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
id = ntohs(hmacs->hmac_ids[i]);
/* Check the id is in the supported range */
- if (id > SCTP_AUTH_HMAC_ID_MAX)
+ if (id > SCTP_AUTH_HMAC_ID_MAX) {
+ id = 0;
continue;
+ }
/* See is we support the id. Supported IDs have name and
* length fields set, so that we can allocated and use
* them. We can safely just check for name, for without the
* name, we can't allocate the TFM.
*/
- if (!sctp_hmac_list[id].hmac_name)
+ if (!sctp_hmac_list[id].hmac_name) {
+ id = 0;
continue;
+ }
break;
}
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 476caaf100e..6c8556459a7 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -37,6 +37,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/net.h>
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index ccb6dc48d15..397296fb156 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -43,6 +43,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <linux/interrupt.h>
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 732689140fb..95e0c8eda1a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -47,6 +47,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -336,7 +338,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
memcpy(saddr, baddr, sizeof(union sctp_addr));
SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr);
} else {
- printk(KERN_ERR "%s: asoc:%p Could not find a valid source "
+ pr_err("%s: asoc:%p Could not find a valid source "
"address for the dest:%pI6\n",
__func__, asoc, &daddr->v6.sin6_addr);
}
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index f73ec0ea93b..8ef8e7d9eb6 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -38,6 +38,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <net/sctp/sctp.h>
@@ -134,8 +136,7 @@ void sctp_dbg_objcnt_init(void)
ent = proc_create("sctp_dbg_objcnt", 0,
proc_net_sctp, &sctp_objcnt_ops);
if (!ent)
- printk(KERN_WARNING
- "sctp_dbg_objcnt: Unable to create /proc entry.\n");
+ pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
}
/* Cleanup the objcount entry in the proc filesystem. */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index a646681f5ac..60600d337a3 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -41,6 +41,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
@@ -92,7 +94,6 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__,
packet, vtag);
- sctp_packet_reset(packet);
packet->vtag = vtag;
if (ecn_capable && sctp_packet_empty(packet)) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index c04b2eb5918..8c6d379b4bb 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -46,6 +46,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/list.h> /* For struct list_head */
#include <linux/socket.h>
@@ -1463,23 +1465,23 @@ static void sctp_check_transmitted(struct sctp_outq *q,
/* Display the end of the
* current range.
*/
- SCTP_DEBUG_PRINTK("-%08x",
- dbg_last_ack_tsn);
+ SCTP_DEBUG_PRINTK_CONT("-%08x",
+ dbg_last_ack_tsn);
}
/* Start a new range. */
- SCTP_DEBUG_PRINTK(",%08x", tsn);
+ SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
dbg_ack_tsn = tsn;
break;
case 1: /* The last TSN was NOT ACKed. */
if (dbg_last_kept_tsn != dbg_kept_tsn) {
/* Display the end of current range. */
- SCTP_DEBUG_PRINTK("-%08x",
- dbg_last_kept_tsn);
+ SCTP_DEBUG_PRINTK_CONT("-%08x",
+ dbg_last_kept_tsn);
}
- SCTP_DEBUG_PRINTK("\n");
+ SCTP_DEBUG_PRINTK_CONT("\n");
/* FALL THROUGH... */
default:
@@ -1526,18 +1528,18 @@ static void sctp_check_transmitted(struct sctp_outq *q,
break;
if (dbg_last_kept_tsn != dbg_kept_tsn)
- SCTP_DEBUG_PRINTK("-%08x",
- dbg_last_kept_tsn);
+ SCTP_DEBUG_PRINTK_CONT("-%08x",
+ dbg_last_kept_tsn);
- SCTP_DEBUG_PRINTK(",%08x", tsn);
+ SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
dbg_kept_tsn = tsn;
break;
case 0:
if (dbg_last_ack_tsn != dbg_ack_tsn)
- SCTP_DEBUG_PRINTK("-%08x",
- dbg_last_ack_tsn);
- SCTP_DEBUG_PRINTK("\n");
+ SCTP_DEBUG_PRINTK_CONT("-%08x",
+ dbg_last_ack_tsn);
+ SCTP_DEBUG_PRINTK_CONT("\n");
/* FALL THROUGH... */
default:
@@ -1556,17 +1558,17 @@ static void sctp_check_transmitted(struct sctp_outq *q,
switch (dbg_prt_state) {
case 0:
if (dbg_last_ack_tsn != dbg_ack_tsn) {
- SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_ack_tsn);
+ SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn);
} else {
- SCTP_DEBUG_PRINTK("\n");
+ SCTP_DEBUG_PRINTK_CONT("\n");
}
break;
case 1:
if (dbg_last_kept_tsn != dbg_kept_tsn) {
- SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_kept_tsn);
+ SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn);
} else {
- SCTP_DEBUG_PRINTK("\n");
+ SCTP_DEBUG_PRINTK_CONT("\n");
}
}
#endif /* SCTP_DEBUG */
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index db3a42b8b34..bc6cd75cc1d 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -22,6 +22,8 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/socket.h>
@@ -117,6 +119,7 @@ static const struct file_operations sctpprobe_fops = {
.owner = THIS_MODULE,
.open = sctpprobe_open,
.read = sctpprobe_read,
+ .llseek = noop_llseek,
};
sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep,
@@ -192,7 +195,7 @@ static __init int sctpprobe_init(void)
if (ret)
goto remove_proc;
- pr_info("SCTP probe registered (port=%d)\n", port);
+ pr_info("probe registered (port=%d)\n", port);
return 0;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5027b83f1cc..1ef29c74d85 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -46,6 +46,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/netdevice.h>
@@ -707,8 +709,7 @@ static int sctp_ctl_sock_init(void)
&init_net);
if (err < 0) {
- printk(KERN_ERR
- "SCTP: Failed to create the SCTP control socket.\n");
+ pr_err("Failed to create the SCTP control socket\n");
return err;
}
return 0;
@@ -798,7 +799,7 @@ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len)
static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp)
{
/* PF_INET only supports AF_INET addresses. */
- return (AF_INET == family);
+ return AF_INET == family;
}
/* Address matching with wildcards allowed. */
@@ -1206,7 +1207,7 @@ SCTP_STATIC __init int sctp_init(void)
__get_free_pages(GFP_ATOMIC, order);
} while (!sctp_assoc_hashtable && --order > 0);
if (!sctp_assoc_hashtable) {
- printk(KERN_ERR "SCTP: Failed association hash alloc.\n");
+ pr_err("Failed association hash alloc\n");
status = -ENOMEM;
goto err_ahash_alloc;
}
@@ -1220,7 +1221,7 @@ SCTP_STATIC __init int sctp_init(void)
sctp_ep_hashtable = (struct sctp_hashbucket *)
kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL);
if (!sctp_ep_hashtable) {
- printk(KERN_ERR "SCTP: Failed endpoint_hash alloc.\n");
+ pr_err("Failed endpoint_hash alloc\n");
status = -ENOMEM;
goto err_ehash_alloc;
}
@@ -1239,7 +1240,7 @@ SCTP_STATIC __init int sctp_init(void)
__get_free_pages(GFP_ATOMIC, order);
} while (!sctp_port_hashtable && --order > 0);
if (!sctp_port_hashtable) {
- printk(KERN_ERR "SCTP: Failed bind hash alloc.");
+ pr_err("Failed bind hash alloc\n");
status = -ENOMEM;
goto err_bhash_alloc;
}
@@ -1248,8 +1249,7 @@ SCTP_STATIC __init int sctp_init(void)
INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
}
- printk(KERN_INFO "SCTP: Hash tables configured "
- "(established %d bind %d)\n",
+ pr_info("Hash tables configured (established %d bind %d)\n",
sctp_assoc_hashsize, sctp_port_hashsize);
/* Disable ADDIP by default. */
@@ -1290,8 +1290,7 @@ SCTP_STATIC __init int sctp_init(void)
/* Initialize the control inode/socket for handling OOTB packets. */
if ((status = sctp_ctl_sock_init())) {
- printk (KERN_ERR
- "SCTP: Failed to initialize the SCTP control sock.\n");
+ pr_err("Failed to initialize the SCTP control sock\n");
goto err_ctl_sock_init;
}
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 246f9292465..2cc46f0962c 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -50,6 +50,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f5e5e27cac5..b21b218d564 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -47,6 +47,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -1146,26 +1148,23 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
case SCTP_DISPOSITION_VIOLATION:
if (net_ratelimit())
- printk(KERN_ERR "sctp protocol violation state %d "
- "chunkid %d\n", state, subtype.chunk);
+ pr_err("protocol violation state %d chunkid %d\n",
+ state, subtype.chunk);
break;
case SCTP_DISPOSITION_NOT_IMPL:
- printk(KERN_WARNING "sctp unimplemented feature in state %d, "
- "event_type %d, event_id %d\n",
- state, event_type, subtype.chunk);
+ pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n",
+ state, event_type, subtype.chunk);
break;
case SCTP_DISPOSITION_BUG:
- printk(KERN_ERR "sctp bug in state %d, "
- "event_type %d, event_id %d\n",
+ pr_err("bug in state %d, event_type %d, event_id %d\n",
state, event_type, subtype.chunk);
BUG();
break;
default:
- printk(KERN_ERR "sctp impossible disposition %d "
- "in state %d, event_type %d, event_id %d\n",
+ pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n",
status, state, event_type, subtype.chunk);
BUG();
break;
@@ -1679,8 +1678,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_cmd_send_asconf(asoc);
break;
default:
- printk(KERN_WARNING "Impossible command: %u, %p\n",
- cmd->verb, cmd->obj.ptr);
+ pr_warn("Impossible command: %u, %p\n",
+ cmd->verb, cmd->obj.ptr);
break;
}
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 24b2cd55563..4b4eb7c96bb 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -50,6 +50,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -1138,18 +1140,16 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
if (unlikely(!link)) {
if (from_addr.sa.sa_family == AF_INET6) {
if (net_ratelimit())
- printk(KERN_WARNING
- "%s association %p could not find address %pI6\n",
- __func__,
- asoc,
- &from_addr.v6.sin6_addr);
+ pr_warn("%s association %p could not find address %pI6\n",
+ __func__,
+ asoc,
+ &from_addr.v6.sin6_addr);
} else {
if (net_ratelimit())
- printk(KERN_WARNING
- "%s association %p could not find address %pI4\n",
- __func__,
- asoc,
- &from_addr.v4.sin_addr.s_addr);
+ pr_warn("%s association %p could not find address %pI4\n",
+ __func__,
+ asoc,
+ &from_addr.v4.sin_addr.s_addr);
}
return SCTP_DISPOSITION_DISCARD;
}
@@ -1232,6 +1232,18 @@ out:
return 0;
}
+static bool list_has_sctp_addr(const struct list_head *list,
+ union sctp_addr *ipaddr)
+{
+ struct sctp_transport *addr;
+
+ list_for_each_entry(addr, list, transports) {
+ if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr))
+ return true;
+ }
+
+ return false;
+}
/* A restart is occurring, check to make sure no new addresses
* are being added as we may be under a takeover attack.
*/
@@ -1240,10 +1252,10 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
struct sctp_chunk *init,
sctp_cmd_seq_t *commands)
{
- struct sctp_transport *new_addr, *addr;
- int found;
+ struct sctp_transport *new_addr;
+ int ret = 1;
- /* Implementor's Guide - Sectin 5.2.2
+ /* Implementor's Guide - Section 5.2.2
* ...
* Before responding the endpoint MUST check to see if the
* unexpected INIT adds new addresses to the association. If new
@@ -1254,31 +1266,19 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
/* Search through all current addresses and make sure
* we aren't adding any new ones.
*/
- new_addr = NULL;
- found = 0;
-
list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list,
- transports) {
- found = 0;
- list_for_each_entry(addr, &asoc->peer.transport_addr_list,
- transports) {
- if (sctp_cmp_addr_exact(&new_addr->ipaddr,
- &addr->ipaddr)) {
- found = 1;
- break;
- }
- }
- if (!found)
+ transports) {
+ if (!list_has_sctp_addr(&asoc->peer.transport_addr_list,
+ &new_addr->ipaddr)) {
+ sctp_sf_send_restart_abort(&new_addr->ipaddr, init,
+ commands);
+ ret = 0;
break;
- }
-
- /* If a new address was added, ABORT the sender. */
- if (!found && new_addr) {
- sctp_sf_send_restart_abort(&new_addr->ipaddr, init, commands);
+ }
}
/* Return success if all addresses were found. */
- return found;
+ return ret;
}
/* Populate the verification/tie tags based on overlapping INIT
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 6d9b3aafcc5..546d4387fb3 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -46,6 +46,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/skbuff.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
@@ -66,15 +68,19 @@ static const sctp_sm_table_entry_t bug = {
.name = "sctp_sf_bug"
};
-#define DO_LOOKUP(_max, _type, _table) \
- if ((event_subtype._type > (_max))) { \
- printk(KERN_WARNING \
- "sctp table %p possible attack:" \
- " event %d exceeds max %d\n", \
- _table, event_subtype._type, _max); \
- return &bug; \
- } \
- return &_table[event_subtype._type][(int)state];
+#define DO_LOOKUP(_max, _type, _table) \
+({ \
+ const sctp_sm_table_entry_t *rtn; \
+ \
+ if ((event_subtype._type > (_max))) { \
+ pr_warn("table %p possible attack: event %d exceeds max %d\n", \
+ _table, event_subtype._type, _max); \
+ rtn = &bug; \
+ } else \
+ rtn = &_table[event_subtype._type][(int)state]; \
+ \
+ rtn; \
+})
const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type,
sctp_state_t state,
@@ -83,21 +89,15 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type,
switch (event_type) {
case SCTP_EVENT_T_CHUNK:
return sctp_chunk_event_lookup(event_subtype.chunk, state);
- break;
case SCTP_EVENT_T_TIMEOUT:
- DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
- timeout_event_table);
- break;
-
+ return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
+ timeout_event_table);
case SCTP_EVENT_T_OTHER:
- DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table);
- break;
-
+ return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other,
+ other_event_table);
case SCTP_EVENT_T_PRIMITIVE:
- DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
- primitive_event_table);
- break;
-
+ return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
+ primitive_event_table);
default:
/* Yikes! We got an illegal event type. */
return &bug;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ca44917872d..e34ca9cc116 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -57,6 +57,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
@@ -916,6 +918,11 @@ SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk,
/* Walk through the addrs buffer and count the number of addresses. */
addr_buf = kaddrs;
while (walk_size < addrs_size) {
+ if (walk_size + sizeof(sa_family_t) > addrs_size) {
+ kfree(kaddrs);
+ return -EINVAL;
+ }
+
sa_addr = (struct sockaddr *)addr_buf;
af = sctp_get_af_specific(sa_addr->sa_family);
@@ -1002,9 +1009,13 @@ static int __sctp_connect(struct sock* sk,
/* Walk through the addrs buffer and count the number of addresses. */
addr_buf = kaddrs;
while (walk_size < addrs_size) {
+ if (walk_size + sizeof(sa_family_t) > addrs_size) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
sa_addr = (union sctp_addr *)addr_buf;
af = sctp_get_af_specific(sa_addr->sa.sa_family);
- port = ntohs(sa_addr->v4.sin_port);
/* If the address family is not supported or if this address
* causes the address buffer to overflow return EINVAL.
@@ -1014,6 +1025,8 @@ static int __sctp_connect(struct sock* sk,
goto out_free;
}
+ port = ntohs(sa_addr->v4.sin_port);
+
/* Save current address so we can work with it */
memcpy(&to, sa_addr, af->sockaddr_len);
@@ -2458,9 +2471,8 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk,
if (params.sack_delay == 0 && params.sack_freq == 0)
return 0;
} else if (optlen == sizeof(struct sctp_assoc_value)) {
- printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value "
- "in delayed_ack socket option deprecated\n");
- printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n");
+ pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
+ pr_warn("Use struct sctp_sack_info instead\n");
if (copy_from_user(&params, optval, optlen))
return -EFAULT;
@@ -2868,10 +2880,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
int val;
if (optlen == sizeof(int)) {
- printk(KERN_WARNING
- "SCTP: Use of int in maxseg socket option deprecated\n");
- printk(KERN_WARNING
- "SCTP: Use struct sctp_assoc_value instead\n");
+ pr_warn("Use of int in maxseg socket option deprecated\n");
+ pr_warn("Use struct sctp_assoc_value instead\n");
if (copy_from_user(&val, optval, optlen))
return -EFAULT;
params.assoc_id = 0;
@@ -3121,10 +3131,8 @@ static int sctp_setsockopt_maxburst(struct sock *sk,
int assoc_id = 0;
if (optlen == sizeof(int)) {
- printk(KERN_WARNING
- "SCTP: Use of int in max_burst socket option deprecated\n");
- printk(KERN_WARNING
- "SCTP: Use struct sctp_assoc_value instead\n");
+ pr_warn("Use of int in max_burst socket option deprecated\n");
+ pr_warn("Use struct sctp_assoc_value instead\n");
if (copy_from_user(&val, optval, optlen))
return -EFAULT;
} else if (optlen == sizeof(struct sctp_assoc_value)) {
@@ -3595,7 +3603,40 @@ out:
/* The SCTP ioctl handler. */
SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
- return -ENOIOCTLCMD;
+ int rc = -ENOTCONN;
+
+ sctp_lock_sock(sk);
+
+ /*
+ * SEQPACKET-style sockets in LISTENING state are valid, for
+ * SCTP, so only discard TCP-style sockets in LISTENING state.
+ */
+ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+ goto out;
+
+ switch (cmd) {
+ case SIOCINQ: {
+ struct sk_buff *skb;
+ unsigned int amount = 0;
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL) {
+ /*
+ * We will only return the amount of this packet since
+ * that is all that will be read.
+ */
+ amount = skb->len;
+ }
+ rc = put_user(amount, (int __user *)arg);
+ break;
+ }
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
+ }
+out:
+ sctp_release_sock(sk);
+ return rc;
}
/* This is the function which gets called during socket creation to
@@ -3854,7 +3895,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
}
out:
- return (retval);
+ return retval;
}
@@ -3910,7 +3951,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
}
out:
- return (retval);
+ return retval;
}
/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
@@ -4281,9 +4322,8 @@ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len,
if (copy_from_user(&params, optval, len))
return -EFAULT;
} else if (len == sizeof(struct sctp_assoc_value)) {
- printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value "
- "in delayed_ack socket option deprecated\n");
- printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n");
+ pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
+ pr_warn("Use struct sctp_sack_info instead\n");
if (copy_from_user(&params, optval, len))
return -EFAULT;
} else
@@ -4929,10 +4969,8 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len,
struct sctp_association *asoc;
if (len == sizeof(int)) {
- printk(KERN_WARNING
- "SCTP: Use of int in maxseg socket option deprecated\n");
- printk(KERN_WARNING
- "SCTP: Use struct sctp_assoc_value instead\n");
+ pr_warn("Use of int in maxseg socket option deprecated\n");
+ pr_warn("Use struct sctp_assoc_value instead\n");
params.assoc_id = 0;
} else if (len >= sizeof(struct sctp_assoc_value)) {
len = sizeof(struct sctp_assoc_value);
@@ -5023,10 +5061,8 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len,
struct sctp_association *asoc;
if (len == sizeof(int)) {
- printk(KERN_WARNING
- "SCTP: Use of int in max_burst socket option deprecated\n");
- printk(KERN_WARNING
- "SCTP: Use struct sctp_assoc_value instead\n");
+ pr_warn("Use of int in max_burst socket option deprecated\n");
+ pr_warn("Use struct sctp_assoc_value instead\n");
params.assoc_id = 0;
} else if (len >= sizeof(struct sctp_assoc_value)) {
len = sizeof(struct sctp_assoc_value);
@@ -5569,7 +5605,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
/* Note: sk->sk_num gets filled in if ephemeral port request. */
ret = sctp_get_port_local(sk, &addr);
- return (ret ? 1 : 0);
+ return ret ? 1 : 0;
}
/*
@@ -5586,8 +5622,7 @@ SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm)) {
if (net_ratelimit()) {
- printk(KERN_INFO
- "SCTP: failed to load transform for %s: %ld\n",
+ pr_info("failed to load transform for %s: %ld\n",
sctp_hmac_alg, PTR_ERR(tfm));
}
return -ENOSYS;
@@ -5716,13 +5751,12 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR;
if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLRDHUP;
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
/* Is it readable? Reconsider this code with TCP-style support. */
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sk->sk_shutdown & RCV_SHUTDOWN))
+ if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
/* The association is either gone or not ready. */
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 132046cb82f..d3ae493d234 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -48,6 +48,8 @@
* be incorporated into the next SCTP release.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/random.h>
@@ -244,10 +246,9 @@ void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
struct dst_entry *dst;
if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
- printk(KERN_WARNING "%s: Reported pmtu %d too low, "
- "using default minimum of %d\n",
- __func__, pmtu,
- SCTP_DEFAULT_MINSEGMENT);
+ pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
+ __func__, pmtu,
+ SCTP_DEFAULT_MINSEGMENT);
/* Use default minimum segment size and disable
* pmtu discovery on this transport.
*/
diff --git a/net/socket.c b/net/socket.c
index 2270b941bcc..abf3e256152 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -209,8 +209,8 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
* specified. Zero is returned for a success.
*/
-int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr,
- int __user *ulen)
+static int move_addr_to_user(struct sockaddr *kaddr, int klen,
+ void __user *uaddr, int __user *ulen)
{
int err;
int len;
@@ -502,6 +502,7 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
const struct file_operations bad_sock_fops = {
.owner = THIS_MODULE,
.open = sock_no_open,
+ .llseek = noop_llseek,
};
/**
@@ -535,14 +536,13 @@ void sock_release(struct socket *sock)
}
EXPORT_SYMBOL(sock_release);
-int sock_tx_timestamp(struct msghdr *msg, struct sock *sk,
- union skb_shared_tx *shtx)
+int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags)
{
- shtx->flags = 0;
+ *tx_flags = 0;
if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
- shtx->hardware = 1;
+ *tx_flags |= SKBTX_HW_TSTAMP;
if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
- shtx->software = 1;
+ *tx_flags |= SKBTX_SW_TSTAMP;
return 0;
}
EXPORT_SYMBOL(sock_tx_timestamp);
@@ -662,7 +662,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
-inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
+static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
{
if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount)
put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
@@ -1919,7 +1920,8 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
* Afterwards, it will be a kernel pointer. Thus the compiler-assisted
* checking falls down on this.
*/
- if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control,
+ if (copy_from_user(ctl_buf,
+ (void __user __force *)msg_sys.msg_control,
ctl_len))
goto out_freectl;
msg_sys.msg_control = ctl_buf;
@@ -3054,14 +3056,19 @@ int kernel_getsockopt(struct socket *sock, int level, int optname,
char *optval, int *optlen)
{
mm_segment_t oldfs = get_fs();
+ char __user *uoptval;
+ int __user *uoptlen;
int err;
+ uoptval = (char __user __force *) optval;
+ uoptlen = (int __user __force *) optlen;
+
set_fs(KERNEL_DS);
if (level == SOL_SOCKET)
- err = sock_getsockopt(sock, level, optname, optval, optlen);
+ err = sock_getsockopt(sock, level, optname, uoptval, uoptlen);
else
- err = sock->ops->getsockopt(sock, level, optname, optval,
- optlen);
+ err = sock->ops->getsockopt(sock, level, optname, uoptval,
+ uoptlen);
set_fs(oldfs);
return err;
}
@@ -3071,13 +3078,16 @@ int kernel_setsockopt(struct socket *sock, int level, int optname,
char *optval, unsigned int optlen)
{
mm_segment_t oldfs = get_fs();
+ char __user *uoptval;
int err;
+ uoptval = (char __user __force *) optval;
+
set_fs(KERNEL_DS);
if (level == SOL_SOCKET)
- err = sock_setsockopt(sock, level, optname, optval, optlen);
+ err = sock_setsockopt(sock, level, optname, uoptval, optlen);
else
- err = sock->ops->setsockopt(sock, level, optname, optval,
+ err = sock->ops->setsockopt(sock, level, optname, uoptval,
optlen);
set_fs(oldfs);
return err;
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 443c161eb8b..3376d765718 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -18,10 +18,11 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
config RPCSEC_GSS_KRB5
- tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
- depends on SUNRPC && EXPERIMENTAL
+ tristate
+ depends on SUNRPC && CRYPTO
+ prompt "Secure RPC: Kerberos V mechanism" if !(NFS_V4 || NFSD_V4)
+ default y
select SUNRPC_GSS
- select CRYPTO
select CRYPTO_MD5
select CRYPTO_DES
select CRYPTO_CBC
@@ -34,7 +35,7 @@ config RPCSEC_GSS_KRB5
available from http://linux-nfs.org/. In addition, user-space
Kerberos support should be installed.
- If unsure, say N.
+ If unsure, say Y.
config RPCSEC_GSS_SPKM3
tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 880d0de3f50..e9eaaf7d43c 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -38,8 +38,8 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
static LIST_HEAD(cred_unused);
static unsigned long number_cred_unused;
-#define MAX_HASHTABLE_BITS (10)
-static int param_set_hashtbl_sz(const char *val, struct kernel_param *kp)
+#define MAX_HASHTABLE_BITS (14)
+static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
{
unsigned long num;
unsigned int nbits;
@@ -61,7 +61,7 @@ out_inval:
return -EINVAL;
}
-static int param_get_hashtbl_sz(char *buffer, struct kernel_param *kp)
+static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp)
{
unsigned int nbits;
@@ -71,6 +71,11 @@ static int param_get_hashtbl_sz(char *buffer, struct kernel_param *kp)
#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
+static struct kernel_param_ops param_ops_hashtbl_sz = {
+ .set = param_set_hashtbl_sz,
+ .get = param_get_hashtbl_sz,
+};
+
module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index dcfc66bab2b..3835ce35e22 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -745,17 +745,18 @@ gss_pipe_release(struct inode *inode)
struct rpc_inode *rpci = RPC_I(inode);
struct gss_upcall_msg *gss_msg;
+restart:
spin_lock(&inode->i_lock);
- while (!list_empty(&rpci->in_downcall)) {
+ list_for_each_entry(gss_msg, &rpci->in_downcall, list) {
- gss_msg = list_entry(rpci->in_downcall.next,
- struct gss_upcall_msg, list);
+ if (!list_empty(&gss_msg->msg.list))
+ continue;
gss_msg->msg.errno = -EPIPE;
atomic_inc(&gss_msg->count);
__gss_unhash_msg(gss_msg);
spin_unlock(&inode->i_lock);
gss_release_msg(gss_msg);
- spin_lock(&inode->i_lock);
+ goto restart;
}
spin_unlock(&inode->i_lock);
@@ -1049,7 +1050,7 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
out:
if (acred->machine_cred != gss_cred->gc_machine_cred)
return 0;
- return (rc->cr_uid == acred->uid);
+ return rc->cr_uid == acred->uid;
}
/*
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index 310b78e9945..c586e92bcf7 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -76,19 +76,19 @@ static int
der_length_size( int length)
{
if (length < (1<<7))
- return(1);
+ return 1;
else if (length < (1<<8))
- return(2);
+ return 2;
#if (SIZEOF_INT == 2)
else
- return(3);
+ return 3;
#else
else if (length < (1<<16))
- return(3);
+ return 3;
else if (length < (1<<24))
- return(4);
+ return 4;
else
- return(5);
+ return 5;
#endif
}
@@ -121,14 +121,14 @@ der_read_length(unsigned char **buf, int *bufsize)
int ret;
if (*bufsize < 1)
- return(-1);
+ return -1;
sf = *(*buf)++;
(*bufsize)--;
if (sf & 0x80) {
if ((sf &= 0x7f) > ((*bufsize)-1))
- return(-1);
+ return -1;
if (sf > SIZEOF_INT)
- return (-1);
+ return -1;
ret = 0;
for (; sf; sf--) {
ret = (ret<<8) + (*(*buf)++);
@@ -138,7 +138,7 @@ der_read_length(unsigned char **buf, int *bufsize)
ret = sf;
}
- return(ret);
+ return ret;
}
/* returns the length of a token, given the mech oid and the body size */
@@ -148,7 +148,7 @@ g_token_size(struct xdr_netobj *mech, unsigned int body_size)
{
/* set body_size to sequence contents size */
body_size += 2 + (int) mech->len; /* NEED overflow check */
- return(1 + der_length_size(body_size) + body_size);
+ return 1 + der_length_size(body_size) + body_size;
}
EXPORT_SYMBOL_GPL(g_token_size);
@@ -186,27 +186,27 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
int ret = 0;
if ((toksize-=1) < 0)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
if (*buf++ != 0x60)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
if ((seqsize = der_read_length(&buf, &toksize)) < 0)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
if (seqsize != toksize)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
if ((toksize-=1) < 0)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
if (*buf++ != 0x06)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
if ((toksize-=1) < 0)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
toid.len = *buf++;
if ((toksize-=toid.len) < 0)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
toid.data = buf;
buf+=toid.len;
@@ -217,17 +217,17 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
to return G_BAD_TOK_HEADER if the token header is in fact bad */
if ((toksize-=2) < 0)
- return(G_BAD_TOK_HEADER);
+ return G_BAD_TOK_HEADER;
if (ret)
- return(ret);
+ return ret;
if (!ret) {
*buf_in = buf;
*body_size = toksize;
}
- return(ret);
+ return ret;
}
EXPORT_SYMBOL_GPL(g_verify_token_header);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 03264461052..778e5dfc514 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -237,6 +237,7 @@ get_key(const void *p, const void *end,
if (!supported_gss_krb5_enctype(alg)) {
printk(KERN_WARNING "gss_kerberos_mech: unsupported "
"encryption key algorithm %d\n", alg);
+ p = ERR_PTR(-EINVAL);
goto out_err;
}
p = simple_get_netobj(p, end, &key);
@@ -282,15 +283,19 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
ctx->enctype = ENCTYPE_DES_CBC_RAW;
ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
- if (ctx->gk5e == NULL)
+ if (ctx->gk5e == NULL) {
+ p = ERR_PTR(-EINVAL);
goto out_err;
+ }
/* The downcall format was designed before we completely understood
* the uses of the context fields; so it includes some stuff we
* just give some minimal sanity-checking, and some we ignore
* completely (like the next twenty bytes): */
- if (unlikely(p + 20 > end || p + 20 < p))
+ if (unlikely(p + 20 > end || p + 20 < p)) {
+ p = ERR_PTR(-EFAULT);
goto out_err;
+ }
p += 20;
p = simple_get_bytes(p, end, &tmp, sizeof(tmp));
if (IS_ERR(p))
@@ -619,6 +624,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
if (ctx->seq_send64 != ctx->seq_send) {
dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__,
(long unsigned)ctx->seq_send64, ctx->seq_send);
+ p = ERR_PTR(-EINVAL);
goto out_err;
}
p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype));
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 415c013ba38..62ac90c62cb 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -162,5 +162,5 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
*seqnum = ((plain[0]) |
(plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
- return (0);
+ return 0;
}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 2689de39dc7..8b4061049d7 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -331,7 +331,7 @@ gss_delete_sec_context(struct gss_ctx **context_handle)
*context_handle);
if (!*context_handle)
- return(GSS_S_NO_CONTEXT);
+ return GSS_S_NO_CONTEXT;
if ((*context_handle)->internal_ctx_id)
(*context_handle)->mech_type->gm_ops
->gss_delete_sec_context((*context_handle)
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index dc3f1f5ed86..adade3d313f 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -100,6 +100,7 @@ gss_import_sec_context_spkm3(const void *p, size_t len,
if (version != 1) {
dprintk("RPC: unknown spkm3 token format: "
"obsolete nfs-utils?\n");
+ p = ERR_PTR(-EINVAL);
goto out_err_free_ctx;
}
@@ -135,8 +136,10 @@ gss_import_sec_context_spkm3(const void *p, size_t len,
if (IS_ERR(p))
goto out_err_free_intg_alg;
- if (p != end)
+ if (p != end) {
+ p = ERR_PTR(-EFAULT);
goto out_err_free_intg_key;
+ }
ctx_id->internal_ctx_id = ctx;
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2b06410e584..7dce81a926c 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -28,7 +28,6 @@
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
#include <asm/ioctls.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/cache.h>
@@ -1348,15 +1347,10 @@ static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
static long cache_ioctl_procfs(struct file *filp,
unsigned int cmd, unsigned long arg)
{
- long ret;
struct inode *inode = filp->f_path.dentry->d_inode;
struct cache_detail *cd = PDE(inode)->data;
- lock_kernel();
- ret = cache_ioctl(inode, filp, cmd, arg, cd);
- unlock_kernel();
-
- return ret;
+ return cache_ioctl(inode, filp, cmd, arg, cd);
}
static int cache_open_procfs(struct inode *inode, struct file *filp)
@@ -1441,6 +1435,7 @@ static const struct file_operations cache_flush_operations_procfs = {
.read = read_flush_procfs,
.write = write_flush_procfs,
.release = release_flush_procfs,
+ .llseek = no_llseek,
};
static void remove_cache_proc_entries(struct cache_detail *cd)
@@ -1555,13 +1550,8 @@ static long cache_ioctl_pipefs(struct file *filp,
{
struct inode *inode = filp->f_dentry->d_inode;
struct cache_detail *cd = RPC_I(inode)->private;
- long ret;
- lock_kernel();
- ret = cache_ioctl(inode, filp, cmd, arg, cd);
- unlock_kernel();
-
- return ret;
+ return cache_ioctl(inode, filp, cmd, arg, cd);
}
static int cache_open_pipefs(struct inode *inode, struct file *filp)
@@ -1646,6 +1636,7 @@ const struct file_operations cache_flush_operations_pipefs = {
.read = read_flush_pipefs,
.write = write_flush_pipefs,
.release = release_flush_pipefs,
+ .llseek = no_llseek,
};
int sunrpc_cache_register_pipefs(struct dentry *parent,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2388d83b68f..fa5549079d7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -226,7 +226,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
goto out_no_principal;
}
- kref_init(&clnt->cl_kref);
+ atomic_set(&clnt->cl_count, 1);
err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
if (err < 0)
@@ -390,14 +390,14 @@ rpc_clone_client(struct rpc_clnt *clnt)
if (new->cl_principal == NULL)
goto out_no_principal;
}
- kref_init(&new->cl_kref);
+ atomic_set(&new->cl_count, 1);
err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
if (err != 0)
goto out_no_path;
if (new->cl_auth)
atomic_inc(&new->cl_auth->au_count);
xprt_get(clnt->cl_xprt);
- kref_get(&clnt->cl_kref);
+ atomic_inc(&clnt->cl_count);
rpc_register_client(new);
rpciod_up();
return new;
@@ -465,10 +465,8 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client);
* Free an RPC client
*/
static void
-rpc_free_client(struct kref *kref)
+rpc_free_client(struct rpc_clnt *clnt)
{
- struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
-
dprintk("RPC: destroying %s client for %s\n",
clnt->cl_protname, clnt->cl_server);
if (!IS_ERR(clnt->cl_path.dentry)) {
@@ -495,12 +493,10 @@ out_free:
* Free an RPC client
*/
static void
-rpc_free_auth(struct kref *kref)
+rpc_free_auth(struct rpc_clnt *clnt)
{
- struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
-
if (clnt->cl_auth == NULL) {
- rpc_free_client(kref);
+ rpc_free_client(clnt);
return;
}
@@ -509,10 +505,11 @@ rpc_free_auth(struct kref *kref)
* release remaining GSS contexts. This mechanism ensures
* that it can do so safely.
*/
- kref_init(kref);
+ atomic_inc(&clnt->cl_count);
rpcauth_release(clnt->cl_auth);
clnt->cl_auth = NULL;
- kref_put(kref, rpc_free_client);
+ if (atomic_dec_and_test(&clnt->cl_count))
+ rpc_free_client(clnt);
}
/*
@@ -525,7 +522,8 @@ rpc_release_client(struct rpc_clnt *clnt)
if (list_empty(&clnt->cl_tasks))
wake_up(&destroy_wait);
- kref_put(&clnt->cl_kref, rpc_free_auth);
+ if (atomic_dec_and_test(&clnt->cl_count))
+ rpc_free_auth(clnt);
}
/**
@@ -588,7 +586,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
if (clnt != NULL) {
rpc_task_release_client(task);
task->tk_client = clnt;
- kref_get(&clnt->cl_kref);
+ atomic_inc(&clnt->cl_count);
if (clnt->cl_softrtry)
task->tk_flags |= RPC_TASK_SOFT;
/* Add to the client's list of all tasks */
@@ -931,7 +929,7 @@ call_reserveresult(struct rpc_task *task)
task->tk_status = 0;
if (status >= 0) {
if (task->tk_rqstp) {
- task->tk_action = call_allocate;
+ task->tk_action = call_refresh;
return;
}
@@ -966,13 +964,54 @@ call_reserveresult(struct rpc_task *task)
}
/*
- * 2. Allocate the buffer. For details, see sched.c:rpc_malloc.
+ * 2. Bind and/or refresh the credentials
+ */
+static void
+call_refresh(struct rpc_task *task)
+{
+ dprint_status(task);
+
+ task->tk_action = call_refreshresult;
+ task->tk_status = 0;
+ task->tk_client->cl_stats->rpcauthrefresh++;
+ rpcauth_refreshcred(task);
+}
+
+/*
+ * 2a. Process the results of a credential refresh
+ */
+static void
+call_refreshresult(struct rpc_task *task)
+{
+ int status = task->tk_status;
+
+ dprint_status(task);
+
+ task->tk_status = 0;
+ task->tk_action = call_allocate;
+ if (status >= 0 && rpcauth_uptodatecred(task))
+ return;
+ switch (status) {
+ case -EACCES:
+ rpc_exit(task, -EACCES);
+ return;
+ case -ENOMEM:
+ rpc_exit(task, -ENOMEM);
+ return;
+ case -ETIMEDOUT:
+ rpc_delay(task, 3*HZ);
+ }
+ task->tk_action = call_refresh;
+}
+
+/*
+ * 2b. Allocate the buffer. For details, see sched.c:rpc_malloc.
* (Note: buffer memory is freed in xprt_release).
*/
static void
call_allocate(struct rpc_task *task)
{
- unsigned int slack = task->tk_client->cl_auth->au_cslack;
+ unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = task->tk_xprt;
struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
@@ -980,7 +1019,7 @@ call_allocate(struct rpc_task *task)
dprint_status(task);
task->tk_status = 0;
- task->tk_action = call_refresh;
+ task->tk_action = call_bind;
if (req->rq_buffer)
return;
@@ -1017,47 +1056,6 @@ call_allocate(struct rpc_task *task)
rpc_exit(task, -ERESTARTSYS);
}
-/*
- * 2a. Bind and/or refresh the credentials
- */
-static void
-call_refresh(struct rpc_task *task)
-{
- dprint_status(task);
-
- task->tk_action = call_refreshresult;
- task->tk_status = 0;
- task->tk_client->cl_stats->rpcauthrefresh++;
- rpcauth_refreshcred(task);
-}
-
-/*
- * 2b. Process the results of a credential refresh
- */
-static void
-call_refreshresult(struct rpc_task *task)
-{
- int status = task->tk_status;
-
- dprint_status(task);
-
- task->tk_status = 0;
- task->tk_action = call_bind;
- if (status >= 0 && rpcauth_uptodatecred(task))
- return;
- switch (status) {
- case -EACCES:
- rpc_exit(task, -EACCES);
- return;
- case -ENOMEM:
- rpc_exit(task, -ENOMEM);
- return;
- case -ETIMEDOUT:
- rpc_delay(task, 3*HZ);
- }
- task->tk_action = call_refresh;
-}
-
static inline int
rpc_task_need_encode(struct rpc_task *task)
{
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 1f7fc502a50..52f25243214 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -27,7 +27,6 @@
#include <linux/workqueue.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/cache.h>
-#include <linux/smp_lock.h>
static struct vfsmount *rpc_mount __read_mostly;
static int rpc_mount_count;
@@ -48,7 +47,7 @@ static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
return;
do {
msg = list_entry(head->next, struct rpc_pipe_msg, list);
- list_del(&msg->list);
+ list_del_init(&msg->list);
msg->errno = err;
destroy_msg(msg);
} while (!list_empty(head));
@@ -208,7 +207,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
if (msg != NULL) {
spin_lock(&inode->i_lock);
msg->errno = -EAGAIN;
- list_del(&msg->list);
+ list_del_init(&msg->list);
spin_unlock(&inode->i_lock);
rpci->ops->destroy_msg(msg);
}
@@ -268,7 +267,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
if (res < 0 || msg->len == msg->copied) {
filp->private_data = NULL;
spin_lock(&inode->i_lock);
- list_del(&msg->list);
+ list_del_init(&msg->list);
spin_unlock(&inode->i_lock);
rpci->ops->destroy_msg(msg);
}
@@ -309,40 +308,33 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
return mask;
}
-static int
-rpc_pipe_ioctl_unlocked(struct file *filp, unsigned int cmd, unsigned long arg)
+static long
+rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
- struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct rpc_inode *rpci = RPC_I(inode);
int len;
switch (cmd) {
case FIONREAD:
- if (rpci->ops == NULL)
+ spin_lock(&inode->i_lock);
+ if (rpci->ops == NULL) {
+ spin_unlock(&inode->i_lock);
return -EPIPE;
+ }
len = rpci->pipelen;
if (filp->private_data) {
struct rpc_pipe_msg *msg;
msg = filp->private_data;
len += msg->len - msg->copied;
}
+ spin_unlock(&inode->i_lock);
return put_user(len, (int __user *)arg);
default:
return -EINVAL;
}
}
-static long
-rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
- long ret;
-
- lock_kernel();
- ret = rpc_pipe_ioctl_unlocked(filp, cmd, arg);
- unlock_kernel();
-
- return ret;
-}
-
static const struct file_operations rpc_pipe_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
@@ -371,21 +363,23 @@ rpc_show_info(struct seq_file *m, void *v)
static int
rpc_info_open(struct inode *inode, struct file *file)
{
- struct rpc_clnt *clnt;
+ struct rpc_clnt *clnt = NULL;
int ret = single_open(file, rpc_show_info, NULL);
if (!ret) {
struct seq_file *m = file->private_data;
- mutex_lock(&inode->i_mutex);
- clnt = RPC_I(inode)->private;
- if (clnt) {
- kref_get(&clnt->cl_kref);
+
+ spin_lock(&file->f_path.dentry->d_lock);
+ if (!d_unhashed(file->f_path.dentry))
+ clnt = RPC_I(inode)->private;
+ if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) {
+ spin_unlock(&file->f_path.dentry->d_lock);
m->private = clnt;
} else {
+ spin_unlock(&file->f_path.dentry->d_lock);
single_release(inode, file);
ret = -EINVAL;
}
- mutex_unlock(&inode->i_mutex);
}
return ret;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cace6049e4a..aa5dbda6608 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -376,7 +376,7 @@ int rpc_queue_empty(struct rpc_wait_queue *queue)
spin_lock_bh(&queue->lock);
res = queue->qlen;
spin_unlock_bh(&queue->lock);
- return (res == 0);
+ return res == 0;
}
EXPORT_SYMBOL_GPL(rpc_queue_empty);
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e5e28d1946a..2ac3f6e8adf 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -249,6 +249,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
req->rl_nchunks = nchunks;
BUG_ON(nchunks == 0);
+ BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+ && (nchunks > 3));
/*
* finish off header. If write, marshal discrim and nchunks.
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 27015c6d8eb..5f4c7b3bc71 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -650,10 +650,22 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
- /* Add room for frmr register and invalidate WRs */
- ep->rep_attr.cap.max_send_wr *= 3;
- if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
- return -EINVAL;
+ /* Add room for frmr register and invalidate WRs.
+ * 1. FRMR reg WR for head
+ * 2. FRMR invalidate WR for head
+ * 3. FRMR reg WR for pagelist
+ * 4. FRMR invalidate WR for pagelist
+ * 5. FRMR reg WR for tail
+ * 6. FRMR invalidate WR for tail
+ * 7. The RDMA_SEND WR
+ */
+ ep->rep_attr.cap.max_send_wr *= 7;
+ if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
+ cdata->max_requests = devattr.max_qp_wr / 7;
+ if (!cdata->max_requests)
+ return -EINVAL;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
+ }
break;
case RPCRDMA_MEMWINDOWS_ASYNC:
case RPCRDMA_MEMWINDOWS:
@@ -1490,7 +1502,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
memset(&frmr_wr, 0, sizeof frmr_wr);
frmr_wr.opcode = IB_WR_FAST_REG_MR;
frmr_wr.send_flags = 0; /* unsignaled */
- frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
+ frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
frmr_wr.wr.fast_reg.page_list_len = i;
frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7ca65c7005e..fe9306bf10c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -800,7 +800,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
u32 _xid;
__be32 *xp;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
dprintk("RPC: xs_udp_data_ready...\n");
if (!(xprt = xprt_from_sock(sk)))
goto out;
@@ -852,7 +852,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
dropit:
skb_free_datagram(sk, skb);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
@@ -1229,7 +1229,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
dprintk("RPC: xs_tcp_data_ready...\n");
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
goto out;
if (xprt->shutdown)
@@ -1248,7 +1248,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
} while (read > 0);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
/*
@@ -1301,18 +1301,19 @@ static void xs_tcp_state_change(struct sock *sk)
{
struct rpc_xprt *xprt;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
goto out;
dprintk("RPC: xs_tcp_state_change client %p...\n", xprt);
- dprintk("RPC: state %x conn %d dead %d zapped %d\n",
+ dprintk("RPC: state %x conn %d dead %d zapped %d sk_shutdown %d\n",
sk->sk_state, xprt_connected(xprt),
sock_flag(sk, SOCK_DEAD),
- sock_flag(sk, SOCK_ZAPPED));
+ sock_flag(sk, SOCK_ZAPPED),
+ sk->sk_shutdown);
switch (sk->sk_state) {
case TCP_ESTABLISHED:
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (!xprt_test_and_set_connected(xprt)) {
struct sock_xprt *transport = container_of(xprt,
struct sock_xprt, xprt);
@@ -1326,7 +1327,7 @@ static void xs_tcp_state_change(struct sock *sk)
xprt_wake_pending_tasks(xprt, -EAGAIN);
}
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
break;
case TCP_FIN_WAIT1:
/* The client initiated a shutdown of the socket */
@@ -1364,7 +1365,7 @@ static void xs_tcp_state_change(struct sock *sk)
xs_sock_mark_closed(xprt);
}
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
/**
@@ -1375,7 +1376,7 @@ static void xs_error_report(struct sock *sk)
{
struct rpc_xprt *xprt;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
goto out;
dprintk("RPC: %s client %p...\n"
@@ -1383,7 +1384,7 @@ static void xs_error_report(struct sock *sk)
__func__, xprt, sk->sk_err);
xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void xs_write_space(struct sock *sk)
@@ -1415,13 +1416,13 @@ static void xs_write_space(struct sock *sk)
*/
static void xs_udp_write_space(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
/* from net/core/sock.c:sock_def_write_space */
if (sock_writeable(sk))
xs_write_space(sk);
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
/**
@@ -1436,13 +1437,13 @@ static void xs_udp_write_space(struct sock *sk)
*/
static void xs_tcp_write_space(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
/* from net/core/stream.c:sk_stream_write_space */
if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
xs_write_space(sk);
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
@@ -1779,10 +1780,25 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *tra
{
unsigned int state = transport->inet->sk_state;
- if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED)
- return;
- if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT))
- return;
+ if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) {
+ /* we don't need to abort the connection if the socket
+ * hasn't undergone a shutdown
+ */
+ if (transport->inet->sk_shutdown == 0)
+ return;
+ dprintk("RPC: %s: TCP_CLOSEd and sk_shutdown set to %d\n",
+ __func__, transport->inet->sk_shutdown);
+ }
+ if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) {
+ /* we don't need to abort the connection if the socket
+ * hasn't undergone a shutdown
+ */
+ if (transport->inet->sk_shutdown == 0)
+ return;
+ dprintk("RPC: %s: ESTABLISHED/SYN_SENT "
+ "sk_shutdown set to %d\n",
+ __func__, transport->inet->sk_shutdown);
+ }
xs_abort_connection(xprt, transport);
}
@@ -2577,7 +2593,8 @@ void cleanup_socket_xprt(void)
xprt_unregister_transport(&xs_bc_tcp_transport);
}
-static int param_set_uint_minmax(const char *val, struct kernel_param *kp,
+static int param_set_uint_minmax(const char *val,
+ const struct kernel_param *kp,
unsigned int min, unsigned int max)
{
unsigned long num;
@@ -2592,34 +2609,37 @@ static int param_set_uint_minmax(const char *val, struct kernel_param *kp,
return 0;
}
-static int param_set_portnr(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
return param_set_uint_minmax(val, kp,
RPC_MIN_RESVPORT,
RPC_MAX_RESVPORT);
}
-static int param_get_portnr(char *buffer, struct kernel_param *kp)
-{
- return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_portnr = {
+ .set = param_set_portnr,
+ .get = param_get_uint,
+};
+
#define param_check_portnr(name, p) \
__param_check(name, p, unsigned int);
module_param_named(min_resvport, xprt_min_resvport, portnr, 0644);
module_param_named(max_resvport, xprt_max_resvport, portnr, 0644);
-static int param_set_slot_table_size(const char *val, struct kernel_param *kp)
+static int param_set_slot_table_size(const char *val,
+ const struct kernel_param *kp)
{
return param_set_uint_minmax(val, kp,
RPC_MIN_SLOT_TABLE,
RPC_MAX_SLOT_TABLE);
}
-static int param_get_slot_table_size(char *buffer, struct kernel_param *kp)
-{
- return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_slot_table_size = {
+ .set = param_set_slot_table_size,
+ .get = param_get_uint,
+};
+
#define param_check_slot_table_size(name, p) \
__param_check(name, p, unsigned int);
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index c048543ffbe..8a2e89bffde 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -41,11 +41,6 @@
#include "cluster.h"
#include "net.h"
-u32 tipc_get_addr(void)
-{
- return tipc_own_addr;
-}
-
/**
* tipc_addr_domain_valid - validates a network domain address
*
@@ -89,7 +84,7 @@ int tipc_addr_domain_valid(u32 addr)
int tipc_addr_node_valid(u32 addr)
{
- return (tipc_addr_domain_valid(addr) && tipc_node(addr));
+ return tipc_addr_domain_valid(addr) && tipc_node(addr);
}
int tipc_in_scope(u32 domain, u32 addr)
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index a008c668930..22a60fc9839 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -121,6 +121,9 @@ static DEFINE_SPINLOCK(bc_lock);
const char tipc_bclink_name[] = "broadcast-link";
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+ struct tipc_node_map *nm_b,
+ struct tipc_node_map *nm_diff);
static u32 buf_seqno(struct sk_buff *buf)
{
@@ -143,6 +146,19 @@ static void bcbuf_decr_acks(struct sk_buff *buf)
}
+static void bclink_set_last_sent(void)
+{
+ if (bcl->next_out)
+ bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1);
+ else
+ bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1);
+}
+
+u32 tipc_bclink_get_last_sent(void)
+{
+ return bcl->fsm_msg_cnt;
+}
+
/**
* bclink_set_gap - set gap according to contents of current deferred pkt queue
*
@@ -171,7 +187,7 @@ static void bclink_set_gap(struct tipc_node *n_ptr)
static int bclink_ack_allowed(u32 n)
{
- return((n % TIPC_MIN_LINK_WIN) == tipc_own_tag);
+ return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag;
}
@@ -237,8 +253,10 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked)
/* Try resolving broadcast link congestion, if necessary */
- if (unlikely(bcl->next_out))
+ if (unlikely(bcl->next_out)) {
tipc_link_push_queue(bcl);
+ bclink_set_last_sent();
+ }
if (unlikely(released && !list_empty(&bcl->waiting_ports)))
tipc_link_wakeup_ports(bcl, 0);
spin_unlock_bh(&bc_lock);
@@ -272,7 +290,7 @@ static void bclink_send_nack(struct tipc_node *n_ptr)
if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to))
return;
- buf = buf_acquire(INT_H_SIZE);
+ buf = tipc_buf_acquire(INT_H_SIZE);
if (buf) {
msg = buf_msg(buf);
tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG,
@@ -395,7 +413,7 @@ int tipc_bclink_send_msg(struct sk_buff *buf)
if (unlikely(res == -ELINKCONG))
buf_discard(buf);
else
- bcl->stats.sent_info++;
+ bclink_set_last_sent();
if (bcl->out_queue_size > bcl->stats.max_queue_sz)
bcl->stats.max_queue_sz = bcl->out_queue_size;
@@ -529,15 +547,6 @@ receive:
tipc_node_unlock(node);
}
-u32 tipc_bclink_get_last_sent(void)
-{
- u32 last_sent = mod(bcl->next_out_no - 1);
-
- if (bcl->next_out)
- last_sent = mod(buf_seqno(bcl->next_out) - 1);
- return last_sent;
-}
-
u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr)
{
return (n_ptr->bclink.supported &&
@@ -570,6 +579,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
msg = buf_msg(buf);
msg_set_non_seq(msg, 1);
msg_set_mc_netid(msg, tipc_net_id);
+ bcl->stats.sent_info++;
}
/* Send buffer over bearers until all targets reached */
@@ -609,11 +619,13 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
bcbearer->remains = bcbearer->remains_new;
}
- /* Unable to reach all targets */
+ /*
+ * Unable to reach all targets (indicate success, since currently
+ * there isn't code in place to properly block & unblock the
+ * pseudo-bearer used by the broadcast link)
+ */
- bcbearer->bearer.publ.blocked = 1;
- bcl->stats.bearer_congs++;
- return 1;
+ return TIPC_OK;
}
/**
@@ -862,8 +874,9 @@ void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node)
* @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
*/
-void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b,
- struct tipc_node_map *nm_diff)
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+ struct tipc_node_map *nm_b,
+ struct tipc_node_map *nm_diff)
{
int stop = ARRAY_SIZE(nm_a->map);
int w;
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index e8c2b81658c..011c03f0a4a 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -84,9 +84,6 @@ static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, struct tipc_node_m
return !memcmp(nm_a, nm_b, sizeof(*nm_a));
}
-void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b,
- struct tipc_node_map *nm_diff);
-
void tipc_port_list_add(struct port_list *pl_ptr, u32 port);
void tipc_port_list_free(struct port_list *pl_ptr);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52ae17b2583..9927d1d56c4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -63,7 +63,7 @@ static int media_name_valid(const char *name)
len = strlen(name);
if ((len + 1) > TIPC_MAX_MEDIA_NAME)
return 0;
- return (strspn(name, tipc_alphabet) == len);
+ return strspn(name, tipc_alphabet) == len;
}
/**
@@ -288,9 +288,6 @@ static struct bearer *bearer_find(const char *name)
struct bearer *b_ptr;
u32 i;
- if (tipc_mode != TIPC_NET_MODE)
- return NULL;
-
for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) {
if (b_ptr->active && (!strcmp(b_ptr->publ.name, name)))
return b_ptr;
@@ -559,8 +556,6 @@ restart:
}
b_ptr = &tipc_bearers[bearer_id];
- memset(b_ptr, 0, sizeof(struct bearer));
-
strcpy(b_ptr->publ.name, name);
res = m_ptr->enable_bearer(&b_ptr->publ);
if (res) {
@@ -630,30 +625,17 @@ int tipc_block_bearer(const char *name)
* Note: This routine assumes caller holds tipc_net_lock.
*/
-static int bearer_disable(const char *name)
+static int bearer_disable(struct bearer *b_ptr)
{
- struct bearer *b_ptr;
struct link *l_ptr;
struct link *temp_l_ptr;
- b_ptr = bearer_find(name);
- if (!b_ptr) {
- warn("Attempt to disable unknown bearer <%s>\n", name);
- return -EINVAL;
- }
-
- info("Disabling bearer <%s>\n", name);
+ info("Disabling bearer <%s>\n", b_ptr->publ.name);
tipc_disc_stop_link_req(b_ptr->link_req);
spin_lock_bh(&b_ptr->publ.lock);
b_ptr->link_req = NULL;
b_ptr->publ.blocked = 1;
- if (b_ptr->media->disable_bearer) {
- spin_unlock_bh(&b_ptr->publ.lock);
- write_unlock_bh(&tipc_net_lock);
- b_ptr->media->disable_bearer(&b_ptr->publ);
- write_lock_bh(&tipc_net_lock);
- spin_lock_bh(&b_ptr->publ.lock);
- }
+ b_ptr->media->disable_bearer(&b_ptr->publ);
list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
tipc_link_delete(l_ptr);
}
@@ -664,10 +646,16 @@ static int bearer_disable(const char *name)
int tipc_disable_bearer(const char *name)
{
+ struct bearer *b_ptr;
int res;
write_lock_bh(&tipc_net_lock);
- res = bearer_disable(name);
+ b_ptr = bearer_find(name);
+ if (b_ptr == NULL) {
+ warn("Attempt to disable unknown bearer <%s>\n", name);
+ res = -EINVAL;
+ } else
+ res = bearer_disable(b_ptr);
write_unlock_bh(&tipc_net_lock);
return res;
}
@@ -680,13 +668,7 @@ void tipc_bearer_stop(void)
for (i = 0; i < MAX_BEARERS; i++) {
if (tipc_bearers[i].active)
- tipc_bearers[i].publ.blocked = 1;
- }
- for (i = 0; i < MAX_BEARERS; i++) {
- if (tipc_bearers[i].active)
- bearer_disable(tipc_bearers[i].publ.name);
+ bearer_disable(&tipc_bearers[i]);
}
media_count = 0;
}
-
-
diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c
index e68f705381b..7fea14b98b9 100644
--- a/net/tipc/cluster.c
+++ b/net/tipc/cluster.c
@@ -113,25 +113,6 @@ void tipc_cltr_delete(struct cluster *c_ptr)
kfree(c_ptr);
}
-u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr)
-{
- struct tipc_node *n_ptr;
- u32 n_num = tipc_node(addr) + 1;
-
- if (!c_ptr)
- return addr;
- for (; n_num <= c_ptr->highest_node; n_num++) {
- n_ptr = c_ptr->nodes[n_num];
- if (n_ptr && tipc_node_has_active_links(n_ptr))
- return n_ptr->addr;
- }
- for (n_num = 1; n_num < tipc_node(addr); n_num++) {
- n_ptr = c_ptr->nodes[n_num];
- if (n_ptr && tipc_node_has_active_links(n_ptr))
- return n_ptr->addr;
- }
- return 0;
-}
void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr)
{
@@ -232,7 +213,7 @@ struct tipc_node *tipc_cltr_select_node(struct cluster *c_ptr, u32 selector)
static struct sk_buff *tipc_cltr_prepare_routing_msg(u32 data_size, u32 dest)
{
u32 size = INT_H_SIZE + data_size;
- struct sk_buff *buf = buf_acquire(size);
+ struct sk_buff *buf = tipc_buf_acquire(size);
struct tipc_msg *msg;
if (buf) {
diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h
index 333efb0b9c4..32636d98c9c 100644
--- a/net/tipc/cluster.h
+++ b/net/tipc/cluster.h
@@ -75,7 +75,7 @@ void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr);
void tipc_cltr_send_slave_routes(struct cluster *c_ptr, u32 dest);
void tipc_cltr_broadcast(struct sk_buff *buf);
int tipc_cltr_init(void);
-u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr);
+
void tipc_cltr_bcast_new_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
void tipc_cltr_send_local_routes(struct cluster *c_ptr, u32 dest);
void tipc_cltr_bcast_lost_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
diff --git a/net/tipc/config.c b/net/tipc/config.c
index 961d1b09714..50a6133a366 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -95,7 +95,7 @@ int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
return 1;
}
-struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
+static struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
{
struct sk_buff *buf;
__be32 value_net;
@@ -109,6 +109,11 @@ struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
return buf;
}
+static struct sk_buff *tipc_cfg_reply_unsigned(u32 value)
+{
+ return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
+}
+
struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string)
{
struct sk_buff *buf;
@@ -120,139 +125,6 @@ struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string)
return buf;
}
-
-#if 0
-
-/* Now obsolete code for handling commands not yet implemented the new way */
-
-/*
- * Some of this code assumed that the manager structure contains two added
- * fields:
- * u32 link_subscriptions;
- * struct list_head link_subscribers;
- * which are currently not present. These fields may need to be re-introduced
- * if and when support for link subscriptions is added.
- */
-
-void tipc_cfg_link_event(u32 addr, char *name, int up)
-{
- /* TIPC DOESN'T HANDLE LINK EVENT SUBSCRIPTIONS AT THE MOMENT */
-}
-
-int tipc_cfg_cmd(const struct tipc_cmd_msg * msg,
- char *data,
- u32 sz,
- u32 *ret_size,
- struct tipc_portid *orig)
-{
- int rv = -EINVAL;
- u32 cmd = msg->cmd;
-
- *ret_size = 0;
- switch (cmd) {
- case TIPC_REMOVE_LINK:
- case TIPC_CMD_BLOCK_LINK:
- case TIPC_CMD_UNBLOCK_LINK:
- if (!cfg_check_connection(orig))
- rv = link_control(msg->argv.link_name, msg->cmd, 0);
- break;
- case TIPC_ESTABLISH:
- {
- int connected;
-
- tipc_isconnected(mng.conn_port_ref, &connected);
- if (connected || !orig) {
- rv = TIPC_FAILURE;
- break;
- }
- rv = tipc_connect2port(mng.conn_port_ref, orig);
- if (rv == TIPC_OK)
- orig = 0;
- break;
- }
- case TIPC_GET_PEER_ADDRESS:
- *ret_size = link_peer_addr(msg->argv.link_name, data, sz);
- break;
- case TIPC_GET_ROUTES:
- rv = TIPC_OK;
- break;
- default: {}
- }
- if (*ret_size)
- rv = TIPC_OK;
- return rv;
-}
-
-static void cfg_cmd_event(struct tipc_cmd_msg *msg,
- char *data,
- u32 sz,
- struct tipc_portid const *orig)
-{
- int rv = -EINVAL;
- struct tipc_cmd_result_msg rmsg;
- struct iovec msg_sect[2];
- int *arg;
-
- msg->cmd = ntohl(msg->cmd);
-
- cfg_prepare_res_msg(msg->cmd, msg->usr_handle, rv, &rmsg, msg_sect,
- data, 0);
- if (ntohl(msg->magic) != TIPC_MAGIC)
- goto exit;
-
- switch (msg->cmd) {
- case TIPC_CREATE_LINK:
- if (!cfg_check_connection(orig))
- rv = disc_create_link(&msg->argv.create_link);
- break;
- case TIPC_LINK_SUBSCRIBE:
- {
- struct subscr_data *sub;
-
- if (mng.link_subscriptions > 64)
- break;
- sub = kmalloc(sizeof(*sub),
- GFP_ATOMIC);
- if (sub == NULL) {
- warn("Memory squeeze; dropped remote link subscription\n");
- break;
- }
- INIT_LIST_HEAD(&sub->subd_list);
- tipc_createport(mng.user_ref,
- (void *)sub,
- TIPC_HIGH_IMPORTANCE,
- 0,
- 0,
- (tipc_conn_shutdown_event)cfg_linksubscr_cancel,
- 0,
- 0,
- (tipc_conn_msg_event)cfg_linksubscr_cancel,
- 0,
- &sub->port_ref);
- if (!sub->port_ref) {
- kfree(sub);
- break;
- }
- memcpy(sub->usr_handle,msg->usr_handle,
- sizeof(sub->usr_handle));
- sub->domain = msg->argv.domain;
- list_add_tail(&sub->subd_list, &mng.link_subscribers);
- tipc_connect2port(sub->port_ref, orig);
- rmsg.retval = TIPC_OK;
- tipc_send(sub->port_ref, 2u, msg_sect);
- mng.link_subscriptions++;
- return;
- }
- default:
- rv = tipc_cfg_cmd(msg, data, sz, (u32 *)&msg_sect[1].iov_len, orig);
- }
-exit:
- rmsg.result_len = htonl(msg_sect[1].iov_len);
- rmsg.retval = htonl(rv);
- tipc_cfg_respond(msg_sect, 2u, orig);
-}
-#endif
-
#define MAX_STATS_INFO 2000
static struct sk_buff *tipc_show_stats(void)
@@ -557,14 +429,6 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area
case TIPC_CMD_SHOW_PORTS:
rep_tlv_buf = tipc_port_get_ports();
break;
-#if 0
- case TIPC_CMD_SHOW_PORT_STATS:
- rep_tlv_buf = port_show_stats(req_tlv_area, req_tlv_space);
- break;
- case TIPC_CMD_RESET_PORT_STATS:
- rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED);
- break;
-#endif
case TIPC_CMD_SET_LOG_SIZE:
rep_tlv_buf = tipc_log_resize_cmd(req_tlv_area, req_tlv_space);
break;
diff --git a/net/tipc/config.h b/net/tipc/config.h
index 5cd7cc56c54..481e12ece71 100644
--- a/net/tipc/config.h
+++ b/net/tipc/config.h
@@ -45,7 +45,6 @@
struct sk_buff *tipc_cfg_reply_alloc(int payload_size);
int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
void *tlv_data, int tlv_data_size);
-struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value);
struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string);
static inline struct sk_buff *tipc_cfg_reply_none(void)
@@ -53,11 +52,6 @@ static inline struct sk_buff *tipc_cfg_reply_none(void)
return tipc_cfg_reply_alloc(0);
}
-static inline struct sk_buff *tipc_cfg_reply_unsigned(u32 value)
-{
- return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
-}
-
static inline struct sk_buff *tipc_cfg_reply_error_string(char *string)
{
return tipc_cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 69646811798..e2a09eb8efd 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -96,13 +96,8 @@ int tipc_net_id;
int tipc_remote_management;
-int tipc_get_mode(void)
-{
- return tipc_mode;
-}
-
/**
- * buf_acquire - creates a TIPC message buffer
+ * tipc_buf_acquire - creates a TIPC message buffer
* @size: message size (including TIPC header)
*
* Returns a new buffer with data pointers set to the specified size.
@@ -111,7 +106,7 @@ int tipc_get_mode(void)
* There may also be unrequested tailroom present at the buffer's end.
*/
-struct sk_buff *buf_acquire(u32 size)
+struct sk_buff *tipc_buf_acquire(u32 size)
{
struct sk_buff *skb;
unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
@@ -129,7 +124,7 @@ struct sk_buff *buf_acquire(u32 size)
* tipc_core_stop_net - shut down TIPC networking sub-systems
*/
-void tipc_core_stop_net(void)
+static void tipc_core_stop_net(void)
{
tipc_eth_media_stop();
tipc_net_stop();
@@ -154,7 +149,7 @@ int tipc_core_start_net(unsigned long addr)
* tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode
*/
-void tipc_core_stop(void)
+static void tipc_core_stop(void)
{
if (tipc_mode != TIPC_NODE_MODE)
return;
@@ -169,13 +164,14 @@ void tipc_core_stop(void)
tipc_nametbl_stop();
tipc_ref_table_stop();
tipc_socket_stop();
+ tipc_log_resize(0);
}
/**
* tipc_core_start - switch TIPC from NOT RUNNING to SINGLE NODE mode
*/
-int tipc_core_start(void)
+static int tipc_core_start(void)
{
int res;
@@ -203,7 +199,9 @@ static int __init tipc_init(void)
{
int res;
- tipc_log_resize(CONFIG_TIPC_LOG);
+ if (tipc_log_resize(CONFIG_TIPC_LOG) != 0)
+ warn("Unable to create log buffer\n");
+
info("Activated (version " TIPC_MOD_VER
" compiled " __DATE__ " " __TIME__ ")\n");
@@ -230,7 +228,6 @@ static void __exit tipc_exit(void)
tipc_core_stop_net();
tipc_core_stop();
info("Deactivated\n");
- tipc_log_resize(0);
}
module_init(tipc_init);
@@ -244,8 +241,6 @@ MODULE_VERSION(TIPC_MOD_VER);
EXPORT_SYMBOL(tipc_attach);
EXPORT_SYMBOL(tipc_detach);
-EXPORT_SYMBOL(tipc_get_addr);
-EXPORT_SYMBOL(tipc_get_mode);
EXPORT_SYMBOL(tipc_createport);
EXPORT_SYMBOL(tipc_deleteport);
EXPORT_SYMBOL(tipc_ownidentity);
@@ -260,23 +255,10 @@ EXPORT_SYMBOL(tipc_withdraw);
EXPORT_SYMBOL(tipc_connect2port);
EXPORT_SYMBOL(tipc_disconnect);
EXPORT_SYMBOL(tipc_shutdown);
-EXPORT_SYMBOL(tipc_isconnected);
-EXPORT_SYMBOL(tipc_peer);
-EXPORT_SYMBOL(tipc_ref_valid);
EXPORT_SYMBOL(tipc_send);
-EXPORT_SYMBOL(tipc_send_buf);
EXPORT_SYMBOL(tipc_send2name);
-EXPORT_SYMBOL(tipc_forward2name);
-EXPORT_SYMBOL(tipc_send_buf2name);
-EXPORT_SYMBOL(tipc_forward_buf2name);
EXPORT_SYMBOL(tipc_send2port);
-EXPORT_SYMBOL(tipc_forward2port);
-EXPORT_SYMBOL(tipc_send_buf2port);
-EXPORT_SYMBOL(tipc_forward_buf2port);
EXPORT_SYMBOL(tipc_multicast);
-/* EXPORT_SYMBOL(tipc_multicast_buf); not available yet */
-EXPORT_SYMBOL(tipc_ispublished);
-EXPORT_SYMBOL(tipc_available_nodes);
/* TIPC API for external bearers (see tipc_bearer.h) */
@@ -293,6 +275,4 @@ EXPORT_SYMBOL(tipc_createport_raw);
EXPORT_SYMBOL(tipc_reject_msg);
EXPORT_SYMBOL(tipc_send_buf_fast);
EXPORT_SYMBOL(tipc_acknowledge);
-EXPORT_SYMBOL(tipc_get_port);
-EXPORT_SYMBOL(tipc_get_handle);
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 188799017ab..e19389e5722 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -83,9 +83,7 @@
* Note: TIPC_LOG is configured to echo its output to the system console;
* user-defined buffers can be configured to do the same thing.
*/
-
extern struct print_buf *const TIPC_NULL;
-extern struct print_buf *const TIPC_CONS;
extern struct print_buf *const TIPC_LOG;
void tipc_printf(struct print_buf *, const char *fmt, ...);
@@ -204,10 +202,7 @@ extern atomic_t tipc_user_count;
* Routines available to privileged subsystems
*/
-extern int tipc_core_start(void);
-extern void tipc_core_stop(void);
-extern int tipc_core_start_net(unsigned long addr);
-extern void tipc_core_stop_net(void);
+extern int tipc_core_start_net(unsigned long);
extern int tipc_handler_start(void);
extern void tipc_handler_stop(void);
extern int tipc_netlink_start(void);
@@ -328,7 +323,7 @@ static inline struct tipc_msg *buf_msg(struct sk_buff *skb)
return (struct tipc_msg *)skb->data;
}
-extern struct sk_buff *buf_acquire(u32 size);
+extern struct sk_buff *tipc_buf_acquire(u32 size);
/**
* buf_discard - frees a TIPC message buffer
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 1885a7edb0c..46f51d208e5 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -52,7 +52,7 @@ static struct print_buf null_buf = { NULL, 0, NULL, 0 };
struct print_buf *const TIPC_NULL = &null_buf;
static struct print_buf cons_buf = { NULL, 0, NULL, 1 };
-struct print_buf *const TIPC_CONS = &cons_buf;
+static struct print_buf *const TIPC_CONS = &cons_buf;
static struct print_buf log_buf = { NULL, 0, NULL, 1 };
struct print_buf *const TIPC_LOG = &log_buf;
@@ -76,6 +76,10 @@ struct print_buf *const TIPC_LOG = &log_buf;
static char print_string[TIPC_PB_MAX_STR];
static DEFINE_SPINLOCK(print_lock);
+static void tipc_printbuf_reset(struct print_buf *pb);
+static int tipc_printbuf_empty(struct print_buf *pb);
+static void tipc_printbuf_move(struct print_buf *pb_to,
+ struct print_buf *pb_from);
#define FORMAT(PTR,LEN,FMT) \
{\
@@ -116,7 +120,7 @@ void tipc_printbuf_init(struct print_buf *pb, char *raw, u32 size)
* @pb: pointer to print buffer structure
*/
-void tipc_printbuf_reset(struct print_buf *pb)
+static void tipc_printbuf_reset(struct print_buf *pb)
{
if (pb->buf) {
pb->crs = pb->buf;
@@ -132,9 +136,9 @@ void tipc_printbuf_reset(struct print_buf *pb)
* Returns non-zero if print buffer is empty.
*/
-int tipc_printbuf_empty(struct print_buf *pb)
+static int tipc_printbuf_empty(struct print_buf *pb)
{
- return (!pb->buf || (pb->crs == pb->buf));
+ return !pb->buf || (pb->crs == pb->buf);
}
/**
@@ -169,7 +173,7 @@ int tipc_printbuf_validate(struct print_buf *pb)
tipc_printf(pb, err);
}
}
- return (pb->crs - pb->buf + 1);
+ return pb->crs - pb->buf + 1;
}
/**
@@ -181,7 +185,8 @@ int tipc_printbuf_validate(struct print_buf *pb)
* Source print buffer becomes empty if a successful move occurs.
*/
-void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from)
+static void tipc_printbuf_move(struct print_buf *pb_to,
+ struct print_buf *pb_from)
{
int len;
diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h
index 5ef1bc8f64e..3ba6ba8b434 100644
--- a/net/tipc/dbg.h
+++ b/net/tipc/dbg.h
@@ -56,10 +56,7 @@ struct print_buf {
#define TIPC_PB_MAX_STR 512 /* max printable string (with trailing NUL) */
void tipc_printbuf_init(struct print_buf *pb, char *buf, u32 size);
-void tipc_printbuf_reset(struct print_buf *pb);
-int tipc_printbuf_empty(struct print_buf *pb);
int tipc_printbuf_validate(struct print_buf *pb);
-void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from);
int tipc_log_resize(int log_size);
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index fc1fcf5e6b5..4a7cd3719b7 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -46,16 +46,6 @@
#define TIPC_LINK_REQ_FAST 2000 /* normal delay if bearer has no links */
#define TIPC_LINK_REQ_SLOW 600000 /* normal delay if bearer has links */
-#if 0
-#define GET_NODE_INFO 300
-#define GET_NODE_INFO_RESULT 301
-#define FORWARD_LINK_PROBE 302
-#define LINK_REQUEST_REJECTED 303
-#define LINK_REQUEST_ACCEPTED 304
-#define DROP_LINK_REQUEST 305
-#define CHECK_LINK_COUNT 306
-#endif
-
/*
* TODO: Most of the inter-cluster setup stuff should be
* rewritten, and be made conformant with specification.
@@ -78,30 +68,6 @@ struct link_req {
unsigned int timer_intv;
};
-
-#if 0
-int disc_create_link(const struct tipc_link_create *argv)
-{
- /*
- * Code for inter cluster link setup here
- */
- return TIPC_OK;
-}
-#endif
-
-/*
- * disc_lost_link(): A link has lost contact
- */
-
-void tipc_disc_link_event(u32 addr, char *name, int up)
-{
- if (in_own_cluster(addr))
- return;
- /*
- * Code for inter cluster link setup here
- */
-}
-
/**
* tipc_disc_init_msg - initialize a link setup message
* @type: message type (request or response)
@@ -115,7 +81,7 @@ static struct sk_buff *tipc_disc_init_msg(u32 type,
u32 dest_domain,
struct bearer *b_ptr)
{
- struct sk_buff *buf = buf_acquire(DSC_H_SIZE);
+ struct sk_buff *buf = tipc_buf_acquire(DSC_H_SIZE);
struct tipc_msg *msg;
if (buf) {
@@ -203,6 +169,14 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr)
return;
}
spin_lock_bh(&n_ptr->lock);
+
+ /* Don't talk to neighbor during cleanup after last session */
+
+ if (n_ptr->cleanup_required) {
+ spin_unlock_bh(&n_ptr->lock);
+ return;
+ }
+
link = n_ptr->links[b_ptr->identity];
if (!link) {
dbg("creating link\n");
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index c36eaeb7d5d..f8e75063612 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -50,9 +50,4 @@ void tipc_disc_stop_link_req(struct link_req *req);
void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr);
-void tipc_disc_link_event(u32 addr, char *name, int up);
-#if 0
-int disc_create_link(const struct tipc_link_create *argv);
-#endif
-
#endif
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 6230d16020c..6e988ba485f 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -72,17 +72,26 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
{
struct sk_buff *clone;
struct net_device *dev;
+ int delta;
clone = skb_clone(buf, GFP_ATOMIC);
- if (clone) {
- skb_reset_network_header(clone);
- dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
- clone->dev = dev;
- dev_hard_header(clone, dev, ETH_P_TIPC,
- &dest->dev_addr.eth_addr,
- dev->dev_addr, clone->len);
- dev_queue_xmit(clone);
+ if (!clone)
+ return 0;
+
+ dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
+ delta = dev->hard_header_len - skb_headroom(buf);
+
+ if ((delta > 0) &&
+ pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ kfree_skb(clone);
+ return 0;
}
+
+ skb_reset_network_header(clone);
+ clone->dev = dev;
+ dev_hard_header(clone, dev, ETH_P_TIPC, &dest->dev_addr.eth_addr,
+ dev->dev_addr, clone->len);
+ dev_queue_xmit(clone);
return 0;
}
@@ -92,15 +101,12 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
* Accept only packets explicitly sent to this node, or broadcast packets;
* ignores packets sent using Ethernet multicast, and traffic sent to other
* nodes (which can happen if interface is running in promiscuous mode).
- * Routine truncates any Ethernet padding/CRC appended to the message,
- * and ensures message size matches actual length
*/
static int recv_msg(struct sk_buff *buf, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv;
- u32 size;
if (!net_eq(dev_net(dev), &init_net)) {
kfree_skb(buf);
@@ -109,13 +115,9 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev,
if (likely(eb_ptr->bearer)) {
if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
- size = msg_size((struct tipc_msg *)buf->data);
- skb_trim(buf, size);
- if (likely(buf->len == size)) {
- buf->next = NULL;
- tipc_recv_msg(buf, eb_ptr->bearer);
- return 0;
- }
+ buf->next = NULL;
+ tipc_recv_msg(buf, eb_ptr->bearer);
+ return 0;
}
}
kfree_skb(buf);
@@ -133,6 +135,16 @@ static int enable_bearer(struct tipc_bearer *tb_ptr)
struct eth_bearer *eb_ptr = &eth_bearers[0];
struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS];
char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1;
+ int pending_dev = 0;
+
+ /* Find unused Ethernet bearer structure */
+
+ while (eb_ptr->dev) {
+ if (!eb_ptr->bearer)
+ pending_dev++;
+ if (++eb_ptr == stop)
+ return pending_dev ? -EAGAIN : -EDQUOT;
+ }
/* Find device with specified name */
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a3616b99529..b31992ccd5d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -99,23 +99,6 @@ struct link_name {
char if_peer[TIPC_MAX_IF_NAME];
};
-#if 0
-
-/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
-
-/**
- * struct link_event - link up/down event notification
- */
-
-struct link_event {
- u32 addr;
- int up;
- void (*fcn)(u32, char *, int);
- char name[TIPC_MAX_LINK_NAME];
-};
-
-#endif
-
static void link_handle_out_of_seq_msg(struct link *l_ptr,
struct sk_buff *buf);
static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf);
@@ -129,6 +112,9 @@ static void link_state_event(struct link *l_ptr, u32 event);
static void link_reset_statistics(struct link *l_ptr);
static void link_print(struct link *l_ptr, struct print_buf *buf,
const char *str);
+static void link_start(struct link *l_ptr);
+static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
+
/*
* Debugging code used by link routines only
@@ -239,13 +225,13 @@ int tipc_link_is_up(struct link *l_ptr)
{
if (!l_ptr)
return 0;
- return (link_working_working(l_ptr) || link_working_unknown(l_ptr));
+ return link_working_working(l_ptr) || link_working_unknown(l_ptr);
}
int tipc_link_is_active(struct link *l_ptr)
{
- return ((l_ptr->owner->active_links[0] == l_ptr) ||
- (l_ptr->owner->active_links[1] == l_ptr));
+ return (l_ptr->owner->active_links[0] == l_ptr) ||
+ (l_ptr->owner->active_links[1] == l_ptr);
}
/**
@@ -459,7 +445,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr);
list_add_tail(&l_ptr->link_list, &b_ptr->links);
- tipc_k_signal((Handler)tipc_link_start, (unsigned long)l_ptr);
+ tipc_k_signal((Handler)link_start, (unsigned long)l_ptr);
dbg("tipc_link_create(): tolerance = %u,cont intv = %u, abort_limit = %u\n",
l_ptr->tolerance, l_ptr->continuity_interval, l_ptr->abort_limit);
@@ -499,9 +485,9 @@ void tipc_link_delete(struct link *l_ptr)
kfree(l_ptr);
}
-void tipc_link_start(struct link *l_ptr)
+static void link_start(struct link *l_ptr)
{
- dbg("tipc_link_start %x\n", l_ptr);
+ dbg("link_start %x\n", l_ptr);
link_state_event(l_ptr, STARTING_EVT);
}
@@ -634,39 +620,9 @@ void tipc_link_stop(struct link *l_ptr)
l_ptr->proto_msg_queue = NULL;
}
-#if 0
-
/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
-
-static void link_recv_event(struct link_event *ev)
-{
- ev->fcn(ev->addr, ev->name, ev->up);
- kfree(ev);
-}
-
-static void link_send_event(void (*fcn)(u32 a, char *n, int up),
- struct link *l_ptr, int up)
-{
- struct link_event *ev;
-
- ev = kmalloc(sizeof(*ev), GFP_ATOMIC);
- if (!ev) {
- warn("Link event allocation failure\n");
- return;
- }
- ev->addr = l_ptr->addr;
- ev->up = up;
- ev->fcn = fcn;
- memcpy(ev->name, l_ptr->name, TIPC_MAX_LINK_NAME);
- tipc_k_signal((Handler)link_recv_event, (unsigned long)ev);
-}
-
-#else
-
#define link_send_event(fcn, l_ptr, up) do { } while (0)
-#endif
-
void tipc_link_reset(struct link *l_ptr)
{
struct sk_buff *buf;
@@ -690,10 +646,7 @@ void tipc_link_reset(struct link *l_ptr)
tipc_node_link_down(l_ptr->owner, l_ptr);
tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr);
-#if 0
- tipc_printf(TIPC_CONS, "\nReset link <%s>\n", l_ptr->name);
- dbg_link_dump();
-#endif
+
if (was_active_link && tipc_node_has_active_links(l_ptr->owner) &&
l_ptr->owner->permit_changeover) {
l_ptr->reset_checkpoint = checkpoint;
@@ -1050,7 +1003,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
/* Fragmentation needed ? */
if (size > max_packet)
- return tipc_link_send_long_buf(l_ptr, buf);
+ return link_send_long_buf(l_ptr, buf);
/* Packet can be queued or sent: */
@@ -1086,7 +1039,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
/* Try creating a new bundle */
if (size <= max_packet * 2 / 3) {
- struct sk_buff *bundler = buf_acquire(max_packet);
+ struct sk_buff *bundler = tipc_buf_acquire(max_packet);
struct tipc_msg bundler_hdr;
if (bundler) {
@@ -1362,7 +1315,7 @@ again:
/* Prepare header of first fragment: */
- buf_chain = buf = buf_acquire(max_pkt);
+ buf_chain = buf = tipc_buf_acquire(max_pkt);
if (!buf)
return -ENOMEM;
buf->next = NULL;
@@ -1419,7 +1372,7 @@ error:
msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE);
msg_set_fragm_no(&fragm_hdr, ++fragm_no);
prev = buf;
- buf = buf_acquire(fragm_sz + INT_H_SIZE);
+ buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
if (!buf)
goto error;
@@ -1802,6 +1755,15 @@ static int link_recv_buf_validate(struct sk_buff *buf)
return pskb_may_pull(buf, hdr_size);
}
+/**
+ * tipc_recv_msg - process TIPC messages arriving from off-node
+ * @head: pointer to message buffer chain
+ * @tb_ptr: pointer to bearer message arrived on
+ *
+ * Invoked with no locks held. Bearer pointer must point to a valid bearer
+ * structure (i.e. cannot be NULL), but bearer can be inactive.
+ */
+
void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
{
read_lock_bh(&tipc_net_lock);
@@ -1819,6 +1781,11 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
head = head->next;
+ /* Ensure bearer is still enabled */
+
+ if (unlikely(!b_ptr->active))
+ goto cont;
+
/* Ensure message is well-formed */
if (unlikely(!link_recv_buf_validate(buf)))
@@ -1855,13 +1822,22 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
goto cont;
}
- /* Locate unicast link endpoint that should handle message */
+ /* Locate neighboring node that sent message */
n_ptr = tipc_node_find(msg_prevnode(msg));
if (unlikely(!n_ptr))
goto cont;
tipc_node_lock(n_ptr);
+ /* Don't talk to neighbor during cleanup after last session */
+
+ if (n_ptr->cleanup_required) {
+ tipc_node_unlock(n_ptr);
+ goto cont;
+ }
+
+ /* Locate unicast link endpoint that should handle message */
+
l_ptr = n_ptr->links[b_ptr->identity];
if (unlikely(!l_ptr)) {
tipc_node_unlock(n_ptr);
@@ -2172,7 +2148,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) {
if (!l_ptr->proto_msg_queue) {
l_ptr->proto_msg_queue =
- buf_acquire(sizeof(l_ptr->proto_msg));
+ tipc_buf_acquire(sizeof(l_ptr->proto_msg));
}
buf = l_ptr->proto_msg_queue;
if (!buf)
@@ -2186,7 +2162,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
msg_dbg(msg, ">>");
- buf = buf_acquire(msg_size);
+ buf = tipc_buf_acquire(msg_size);
if (!buf)
return;
@@ -2345,10 +2321,10 @@ exit:
* tipc_link_tunnel(): Send one message via a link belonging to
* another bearer. Owner node is locked.
*/
-void tipc_link_tunnel(struct link *l_ptr,
- struct tipc_msg *tunnel_hdr,
- struct tipc_msg *msg,
- u32 selector)
+static void tipc_link_tunnel(struct link *l_ptr,
+ struct tipc_msg *tunnel_hdr,
+ struct tipc_msg *msg,
+ u32 selector)
{
struct link *tunnel;
struct sk_buff *buf;
@@ -2361,7 +2337,7 @@ void tipc_link_tunnel(struct link *l_ptr,
return;
}
msg_set_size(tunnel_hdr, length + INT_H_SIZE);
- buf = buf_acquire(length + INT_H_SIZE);
+ buf = tipc_buf_acquire(length + INT_H_SIZE);
if (!buf) {
warn("Link changeover error, "
"unable to send tunnel msg\n");
@@ -2407,7 +2383,7 @@ void tipc_link_changeover(struct link *l_ptr)
if (!l_ptr->first_out) {
struct sk_buff *buf;
- buf = buf_acquire(INT_H_SIZE);
+ buf = tipc_buf_acquire(INT_H_SIZE);
if (buf) {
skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE);
msg_set_size(&tunnel_hdr, INT_H_SIZE);
@@ -2468,7 +2444,7 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel)
msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */
msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
msg_set_size(&tunnel_hdr, length + INT_H_SIZE);
- outbuf = buf_acquire(length + INT_H_SIZE);
+ outbuf = tipc_buf_acquire(length + INT_H_SIZE);
if (outbuf == NULL) {
warn("Link changeover error, "
"unable to send duplicate msg\n");
@@ -2504,7 +2480,7 @@ static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos)
u32 size = msg_size(msg);
struct sk_buff *eb;
- eb = buf_acquire(size);
+ eb = tipc_buf_acquire(size);
if (eb)
skb_copy_to_linear_data(eb, msg, size);
return eb;
@@ -2632,11 +2608,11 @@ void tipc_link_recv_bundle(struct sk_buff *buf)
/*
- * tipc_link_send_long_buf: Entry for buffers needing fragmentation.
+ * link_send_long_buf: Entry for buffers needing fragmentation.
* The buffer is complete, inclusive total message length.
* Returns user data length.
*/
-int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
+static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
{
struct tipc_msg *inmsg = buf_msg(buf);
struct tipc_msg fragm_hdr;
@@ -2675,7 +2651,7 @@ int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
fragm_sz = rest;
msg_set_type(&fragm_hdr, LAST_FRAGMENT);
}
- fragm = buf_acquire(fragm_sz + INT_H_SIZE);
+ fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
if (fragm == NULL) {
warn("Link unable to fragment message\n");
dsz = -ENOMEM;
@@ -2780,7 +2756,7 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb,
buf_discard(fbuf);
return 0;
}
- pbuf = buf_acquire(msg_size(imsg));
+ pbuf = tipc_buf_acquire(msg_size(imsg));
if (pbuf != NULL) {
pbuf->next = *pending;
*pending = pbuf;
@@ -3174,44 +3150,6 @@ struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_s
return buf;
}
-#if 0
-int link_control(const char *name, u32 op, u32 val)
-{
- int res = -EINVAL;
- struct link *l_ptr;
- u32 bearer_id;
- struct tipc_node * node;
- u32 a;
-
- a = link_name2addr(name, &bearer_id);
- read_lock_bh(&tipc_net_lock);
- node = tipc_node_find(a);
- if (node) {
- tipc_node_lock(node);
- l_ptr = node->links[bearer_id];
- if (l_ptr) {
- if (op == TIPC_REMOVE_LINK) {
- struct bearer *b_ptr = l_ptr->b_ptr;
- spin_lock_bh(&b_ptr->publ.lock);
- tipc_link_delete(l_ptr);
- spin_unlock_bh(&b_ptr->publ.lock);
- }
- if (op == TIPC_CMD_BLOCK_LINK) {
- tipc_link_reset(l_ptr);
- l_ptr->blocked = 1;
- }
- if (op == TIPC_CMD_UNBLOCK_LINK) {
- l_ptr->blocked = 0;
- }
- res = 0;
- }
- tipc_node_unlock(node);
- }
- read_unlock_bh(&tipc_net_lock);
- return res;
-}
-#endif
-
/**
* tipc_link_get_max_pkt - get maximum packet size to use when sending to destination
* @dest: network address of destination node
@@ -3242,28 +3180,6 @@ u32 tipc_link_get_max_pkt(u32 dest, u32 selector)
return res;
}
-#if 0
-static void link_dump_rec_queue(struct link *l_ptr)
-{
- struct sk_buff *crs;
-
- if (!l_ptr->oldest_deferred_in) {
- info("Reception queue empty\n");
- return;
- }
- info("Contents of Reception queue:\n");
- crs = l_ptr->oldest_deferred_in;
- while (crs) {
- if (crs->data == (void *)0x0000a3a3) {
- info("buffer %x invalid\n", crs);
- return;
- }
- msg_dbg(buf_msg(crs), "In rec queue:\n");
- crs = crs->next;
- }
-}
-#endif
-
static void link_dump_send_queue(struct link *l_ptr)
{
if (l_ptr->next_out) {
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 2e5385c47d3..f98bc613de6 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -210,10 +210,6 @@ struct link {
u32 msg_length_counts;
u32 msg_lengths_total;
u32 msg_length_profile[7];
-#if 0
- u32 sent_tunneled;
- u32 recv_tunneled;
-#endif
} stats;
struct print_buf print_buf;
@@ -229,7 +225,6 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *dest);
void tipc_link_reset_fragments(struct link *l_ptr);
int tipc_link_is_up(struct link *l_ptr);
int tipc_link_is_active(struct link *l_ptr);
-void tipc_link_start(struct link *l_ptr);
u32 tipc_link_push_packet(struct link *l_ptr);
void tipc_link_stop(struct link *l_ptr);
struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd);
@@ -243,9 +238,6 @@ int tipc_link_send_sections_fast(struct port* sender,
struct iovec const *msg_sect,
const u32 num_sect,
u32 destnode);
-int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
-void tipc_link_tunnel(struct link *l_ptr, struct tipc_msg *tnl_hdr,
- struct tipc_msg *msg, u32 selector);
void tipc_link_recv_bundle(struct sk_buff *buf);
int tipc_link_recv_fragment(struct sk_buff **pending,
struct sk_buff **fb,
@@ -279,12 +271,12 @@ static inline int between(u32 lower, u32 upper, u32 n)
static inline int less_eq(u32 left, u32 right)
{
- return (mod(right - left) < 32768u);
+ return mod(right - left) < 32768u;
}
static inline int less(u32 left, u32 right)
{
- return (less_eq(left, right) && (mod(right) != mod(left)));
+ return less_eq(left, right) && (mod(right) != mod(left));
}
static inline u32 lesser(u32 left, u32 right)
@@ -299,32 +291,32 @@ static inline u32 lesser(u32 left, u32 right)
static inline int link_working_working(struct link *l_ptr)
{
- return (l_ptr->state == WORKING_WORKING);
+ return l_ptr->state == WORKING_WORKING;
}
static inline int link_working_unknown(struct link *l_ptr)
{
- return (l_ptr->state == WORKING_UNKNOWN);
+ return l_ptr->state == WORKING_UNKNOWN;
}
static inline int link_reset_unknown(struct link *l_ptr)
{
- return (l_ptr->state == RESET_UNKNOWN);
+ return l_ptr->state == RESET_UNKNOWN;
}
static inline int link_reset_reset(struct link *l_ptr)
{
- return (l_ptr->state == RESET_RESET);
+ return l_ptr->state == RESET_RESET;
}
static inline int link_blocked(struct link *l_ptr)
{
- return (l_ptr->exp_msg_count || l_ptr->blocked);
+ return l_ptr->exp_msg_count || l_ptr->blocked;
}
static inline int link_congested(struct link *l_ptr)
{
- return (l_ptr->out_queue_size >= l_ptr->queue_limit[0]);
+ return l_ptr->out_queue_size >= l_ptr->queue_limit[0];
}
#endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 381063817b4..ecb532fb035 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -112,7 +112,7 @@ int tipc_msg_build(struct tipc_msg *hdr,
return dsz;
}
- *buf = buf_acquire(sz);
+ *buf = tipc_buf_acquire(sz);
if (!(*buf))
return -ENOMEM;
skb_copy_to_linear_data(*buf, hdr, hsz);
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 995d2da35b0..031aad18efc 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -104,7 +104,7 @@ static inline u32 msg_user(struct tipc_msg *m)
static inline u32 msg_isdata(struct tipc_msg *m)
{
- return (msg_user(m) <= TIPC_CRITICAL_IMPORTANCE);
+ return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE;
}
static inline void msg_set_user(struct tipc_msg *m, u32 n)
@@ -289,7 +289,7 @@ static inline void msg_set_destnode(struct tipc_msg *m, u32 a)
static inline int msg_is_dest(struct tipc_msg *m, u32 d)
{
- return(msg_short(m) || (msg_destnode(m) == d));
+ return msg_short(m) || (msg_destnode(m) == d);
}
static inline u32 msg_routed(struct tipc_msg *m)
@@ -632,7 +632,7 @@ static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
static inline u32 msg_max_pkt(struct tipc_msg *m)
{
- return (msg_bits(m, 9, 16, 0xffff) * 4);
+ return msg_bits(m, 9, 16, 0xffff) * 4;
}
static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 6ac3c543250..7b907171f87 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -98,7 +98,7 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest)
{
- struct sk_buff *buf = buf_acquire(LONG_H_SIZE + size);
+ struct sk_buff *buf = tipc_buf_acquire(LONG_H_SIZE + size);
struct tipc_msg *msg;
if (buf != NULL) {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 8ba79620db3..3a8de4334da 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -116,7 +116,7 @@ DEFINE_RWLOCK(tipc_nametbl_lock);
static int hash(int x)
{
- return(x & (tipc_nametbl_size - 1));
+ return x & (tipc_nametbl_size - 1);
}
/**
@@ -613,8 +613,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower,
}
/*
- * tipc_nametbl_translate(): Translate tipc_name -> tipc_portid.
- * Very time-critical.
+ * tipc_nametbl_translate - translate name to port id
*
* Note: on entry 'destnode' is the search domain used during translation;
* on exit it passes back the node address of the matching port (if any)
@@ -685,7 +684,6 @@ found:
}
spin_unlock_bh(&seq->lock);
not_found:
- *destnode = 0;
read_unlock_bh(&tipc_nametbl_lock);
return 0;
}
@@ -877,7 +875,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
u32 index)
{
char portIdStr[27];
- char *scopeStr;
+ const char *scope_str[] = {"", " zone", " cluster", " node"};
struct publication *publ = sseq->zone_list;
tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper);
@@ -893,15 +891,8 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
tipc_node(publ->node), publ->ref);
tipc_printf(buf, "%-26s ", portIdStr);
if (depth > 3) {
- if (publ->node != tipc_own_addr)
- scopeStr = "";
- else if (publ->scope == TIPC_NODE_SCOPE)
- scopeStr = "node";
- else if (publ->scope == TIPC_CLUSTER_SCOPE)
- scopeStr = "cluster";
- else
- scopeStr = "zone";
- tipc_printf(buf, "%-10u %s", publ->key, scopeStr);
+ tipc_printf(buf, "%-10u %s", publ->key,
+ scope_str[publ->scope]);
}
publ = publ->zone_list_next;
@@ -951,24 +942,19 @@ static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth,
static void nametbl_header(struct print_buf *buf, u32 depth)
{
- tipc_printf(buf, "Type ");
-
- if (depth > 1)
- tipc_printf(buf, "Lower Upper ");
- if (depth > 2)
- tipc_printf(buf, "Port Identity ");
- if (depth > 3)
- tipc_printf(buf, "Publication");
-
- tipc_printf(buf, "\n-----------");
-
- if (depth > 1)
- tipc_printf(buf, "--------------------- ");
- if (depth > 2)
- tipc_printf(buf, "-------------------------- ");
- if (depth > 3)
- tipc_printf(buf, "------------------");
-
+ const char *header[] = {
+ "Type ",
+ "Lower Upper ",
+ "Port Identity ",
+ "Publication Scope"
+ };
+
+ int i;
+
+ if (depth > 4)
+ depth = 4;
+ for (i = 0; i < depth; i++)
+ tipc_printf(buf, header[i]);
tipc_printf(buf, "\n");
}
@@ -1023,16 +1009,6 @@ static void nametbl_list(struct print_buf *buf, u32 depth_info,
}
}
-#if 0
-void tipc_nametbl_print(struct print_buf *buf, const char *str)
-{
- tipc_printf(buf, str);
- read_lock_bh(&tipc_nametbl_lock);
- nametbl_list(buf, 0, 0, 0, 0);
- read_unlock_bh(&tipc_nametbl_lock);
-}
-#endif
-
#define MAX_NAME_TBL_QUERY 32768
struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
@@ -1065,13 +1041,6 @@ struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
return buf;
}
-#if 0
-void tipc_nametbl_dump(void)
-{
- nametbl_list(TIPC_CONS, 0, 0, 0, 0);
-}
-#endif
-
int tipc_nametbl_init(void)
{
table.types = kcalloc(tipc_nametbl_size, sizeof(struct hlist_head),
diff --git a/net/tipc/net.c b/net/tipc/net.c
index f61b7694138..1a621cfd660 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -129,15 +129,6 @@ u32 tipc_net_select_router(u32 addr, u32 ref)
return tipc_zone_select_router(tipc_net.zones[tipc_zone(addr)], addr, ref);
}
-#if 0
-u32 tipc_net_next_node(u32 a)
-{
- if (tipc_net.zones[tipc_zone(a)])
- return tipc_zone_next_node(a);
- return 0;
-}
-#endif
-
void tipc_net_remove_as_router(u32 router)
{
u32 z_num;
@@ -248,6 +239,7 @@ void tipc_net_route_msg(struct sk_buff *buf)
/* Handle message for another node */
msg_dbg(msg, "NET>SEND>: ");
+ skb_trim(buf, msg_size(msg));
tipc_link_send(buf, dnode, msg_link_selector(msg));
}
diff --git a/net/tipc/node.c b/net/tipc/node.c
index b634942caba..b4d87eb2dc5 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -50,7 +50,8 @@ void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str);
static void node_lost_contact(struct tipc_node *n_ptr);
static void node_established_contact(struct tipc_node *n_ptr);
-struct tipc_node *tipc_nodes = NULL; /* sorted list of nodes within cluster */
+/* sorted list of nodes within cluster */
+static struct tipc_node *tipc_nodes = NULL;
static DEFINE_SPINLOCK(node_create_lock);
@@ -125,16 +126,6 @@ void tipc_node_delete(struct tipc_node *n_ptr)
if (!n_ptr)
return;
-#if 0
- /* Not needed because links are already deleted via tipc_bearer_stop() */
-
- u32 l_num;
-
- for (l_num = 0; l_num < MAX_BEARERS; l_num++) {
- link_delete(n_ptr->links[l_num]);
- }
-#endif
-
dbg("node %x deleted\n", n_ptr->addr);
kfree(n_ptr);
}
@@ -237,23 +228,22 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr)
int tipc_node_has_active_links(struct tipc_node *n_ptr)
{
- return (n_ptr &&
- ((n_ptr->active_links[0]) || (n_ptr->active_links[1])));
+ return n_ptr->active_links[0] != NULL;
}
int tipc_node_has_redundant_links(struct tipc_node *n_ptr)
{
- return (n_ptr->working_links > 1);
+ return n_ptr->working_links > 1;
}
static int tipc_node_has_active_routes(struct tipc_node *n_ptr)
{
- return (n_ptr && (n_ptr->last_router >= 0));
+ return n_ptr && (n_ptr->last_router >= 0);
}
int tipc_node_is_up(struct tipc_node *n_ptr)
{
- return (tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr));
+ return tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr);
}
struct tipc_node *tipc_node_attach_link(struct link *l_ptr)
@@ -384,6 +374,20 @@ static void node_established_contact(struct tipc_node *n_ptr)
tipc_highest_allowed_slave);
}
+static void node_cleanup_finished(unsigned long node_addr)
+{
+ struct tipc_node *n_ptr;
+
+ read_lock_bh(&tipc_net_lock);
+ n_ptr = tipc_node_find(node_addr);
+ if (n_ptr) {
+ tipc_node_lock(n_ptr);
+ n_ptr->cleanup_required = 0;
+ tipc_node_unlock(n_ptr);
+ }
+ read_unlock_bh(&tipc_net_lock);
+}
+
static void node_lost_contact(struct tipc_node *n_ptr)
{
struct cluster *c_ptr;
@@ -458,6 +462,11 @@ static void node_lost_contact(struct tipc_node *n_ptr)
tipc_k_signal((Handler)ns->handle_node_down,
(unsigned long)ns->usr_handle);
}
+
+ /* Prevent re-contact with node until all cleanup is done */
+
+ n_ptr->cleanup_required = 1;
+ tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr);
}
/**
@@ -579,38 +588,6 @@ void tipc_node_remove_router(struct tipc_node *n_ptr, u32 router)
node_lost_contact(n_ptr);
}
-#if 0
-void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str)
-{
- u32 i;
-
- tipc_printf(buf, "\n\n%s", str);
- for (i = 0; i < MAX_BEARERS; i++) {
- if (!n_ptr->links[i])
- continue;
- tipc_printf(buf, "Links[%u]: %x, ", i, n_ptr->links[i]);
- }
- tipc_printf(buf, "Active links: [%x,%x]\n",
- n_ptr->active_links[0], n_ptr->active_links[1]);
-}
-#endif
-
-u32 tipc_available_nodes(const u32 domain)
-{
- struct tipc_node *n_ptr;
- u32 cnt = 0;
-
- read_lock_bh(&tipc_net_lock);
- for (n_ptr = tipc_nodes; n_ptr; n_ptr = n_ptr->next) {
- if (!tipc_in_scope(domain, n_ptr->addr))
- continue;
- if (tipc_node_is_up(n_ptr))
- cnt++;
- }
- read_unlock_bh(&tipc_net_lock);
- return cnt;
-}
-
struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space)
{
u32 domain;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 6f990da5d14..fff331b2d26 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -52,6 +52,7 @@
* @active_links: pointers to active links to node
* @links: pointers to all links to node
* @working_links: number of working links to node (both active and standby)
+ * @cleanup_required: non-zero if cleaning up after a prior loss of contact
* @link_cnt: number of links to node
* @permit_changeover: non-zero if node has redundant links to this system
* @routers: bitmap (used for multicluster communication)
@@ -78,6 +79,7 @@ struct tipc_node {
struct link *links[MAX_BEARERS];
int link_cnt;
int working_links;
+ int cleanup_required;
int permit_changeover;
u32 routers[512/32];
int last_router;
@@ -94,7 +96,6 @@ struct tipc_node {
} bclink;
};
-extern struct tipc_node *tipc_nodes;
extern u32 tipc_own_tag;
struct tipc_node *tipc_node_create(u32 addr);
diff --git a/net/tipc/port.c b/net/tipc/port.c
index 0737680e926..82092eaa153 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -293,34 +293,6 @@ int tipc_deleteport(u32 ref)
return 0;
}
-/**
- * tipc_get_port() - return port associated with 'ref'
- *
- * Note: Port is not locked.
- */
-
-struct tipc_port *tipc_get_port(const u32 ref)
-{
- return (struct tipc_port *)tipc_ref_deref(ref);
-}
-
-/**
- * tipc_get_handle - return user handle associated to port 'ref'
- */
-
-void *tipc_get_handle(const u32 ref)
-{
- struct port *p_ptr;
- void * handle;
-
- p_ptr = tipc_port_lock(ref);
- if (!p_ptr)
- return NULL;
- handle = p_ptr->publ.usr_handle;
- tipc_port_unlock(p_ptr);
- return handle;
-}
-
static int port_unreliable(struct port *p_ptr)
{
return msg_src_droppable(&p_ptr->publ.phdr);
@@ -392,7 +364,7 @@ static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode,
struct sk_buff *buf;
struct tipc_msg *msg;
- buf = buf_acquire(LONG_H_SIZE);
+ buf = tipc_buf_acquire(LONG_H_SIZE);
if (buf) {
msg = buf_msg(buf);
tipc_msg_init(msg, usr, type, LONG_H_SIZE, destnode);
@@ -433,7 +405,7 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err)
hdr_sz = MCAST_H_SIZE;
else
hdr_sz = LONG_H_SIZE;
- rbuf = buf_acquire(data_sz + hdr_sz);
+ rbuf = tipc_buf_acquire(data_sz + hdr_sz);
if (rbuf == NULL) {
buf_discard(buf);
return data_sz;
@@ -588,19 +560,10 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf)
if (!p_ptr) {
err = TIPC_ERR_NO_PORT;
} else if (p_ptr->publ.connected) {
- if (port_peernode(p_ptr) != msg_orignode(msg))
+ if ((port_peernode(p_ptr) != msg_orignode(msg)) ||
+ (port_peerport(p_ptr) != msg_origport(msg))) {
err = TIPC_ERR_NO_PORT;
- if (port_peerport(p_ptr) != msg_origport(msg))
- err = TIPC_ERR_NO_PORT;
- if (!err && msg_routed(msg)) {
- u32 seqno = msg_transp_seqno(msg);
- u32 myno = ++p_ptr->last_in_seqno;
- if (seqno != myno) {
- err = TIPC_ERR_NO_PORT;
- abort_buf = port_build_self_abort_msg(p_ptr, err);
- }
- }
- if (msg_type(msg) == CONN_ACK) {
+ } else if (msg_type(msg) == CONN_ACK) {
int wakeup = tipc_port_congested(p_ptr) &&
p_ptr->publ.congested &&
p_ptr->wakeup;
@@ -719,50 +682,6 @@ struct sk_buff *tipc_port_get_ports(void)
return buf;
}
-#if 0
-
-#define MAX_PORT_STATS 2000
-
-struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space)
-{
- u32 ref;
- struct port *p_ptr;
- struct sk_buff *buf;
- struct tlv_desc *rep_tlv;
- struct print_buf pb;
- int str_len;
-
- if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_PORT_REF))
- return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
-
- ref = *(u32 *)TLV_DATA(req_tlv_area);
- ref = ntohl(ref);
-
- p_ptr = tipc_port_lock(ref);
- if (!p_ptr)
- return cfg_reply_error_string("port not found");
-
- buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_PORT_STATS));
- if (!buf) {
- tipc_port_unlock(p_ptr);
- return NULL;
- }
- rep_tlv = (struct tlv_desc *)buf->data;
-
- tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_STATS);
- port_print(p_ptr, &pb, 1);
- /* NEED TO FILL IN ADDITIONAL PORT STATISTICS HERE */
- tipc_port_unlock(p_ptr);
- str_len = tipc_printbuf_validate(&pb);
-
- skb_put(buf, TLV_SPACE(str_len));
- TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
-
- return buf;
-}
-
-#endif
-
void tipc_port_reinit(void)
{
struct port *p_ptr;
@@ -1295,50 +1214,13 @@ int tipc_shutdown(u32 ref)
return tipc_disconnect(ref);
}
-int tipc_isconnected(u32 ref, int *isconnected)
-{
- struct port *p_ptr;
-
- p_ptr = tipc_port_lock(ref);
- if (!p_ptr)
- return -EINVAL;
- *isconnected = p_ptr->publ.connected;
- tipc_port_unlock(p_ptr);
- return 0;
-}
-
-int tipc_peer(u32 ref, struct tipc_portid *peer)
-{
- struct port *p_ptr;
- int res;
-
- p_ptr = tipc_port_lock(ref);
- if (!p_ptr)
- return -EINVAL;
- if (p_ptr->publ.connected) {
- peer->ref = port_peerport(p_ptr);
- peer->node = port_peernode(p_ptr);
- res = 0;
- } else
- res = -ENOTCONN;
- tipc_port_unlock(p_ptr);
- return res;
-}
-
-int tipc_ref_valid(u32 ref)
-{
- /* Works irrespective of type */
- return !!tipc_ref_deref(ref);
-}
-
-
/*
* tipc_port_recv_sections(): Concatenate and deliver sectioned
* message for this node.
*/
-int tipc_port_recv_sections(struct port *sender, unsigned int num_sect,
- struct iovec const *msg_sect)
+static int tipc_port_recv_sections(struct port *sender, unsigned int num_sect,
+ struct iovec const *msg_sect)
{
struct sk_buff *buf;
int res;
@@ -1389,65 +1271,16 @@ int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect)
}
/**
- * tipc_send_buf - send message buffer on connection
- */
-
-int tipc_send_buf(u32 ref, struct sk_buff *buf, unsigned int dsz)
-{
- struct port *p_ptr;
- struct tipc_msg *msg;
- u32 destnode;
- u32 hsz;
- u32 sz;
- u32 res;
-
- p_ptr = tipc_port_deref(ref);
- if (!p_ptr || !p_ptr->publ.connected)
- return -EINVAL;
-
- msg = &p_ptr->publ.phdr;
- hsz = msg_hdr_sz(msg);
- sz = hsz + dsz;
- msg_set_size(msg, sz);
- if (skb_cow(buf, hsz))
- return -ENOMEM;
-
- skb_push(buf, hsz);
- skb_copy_to_linear_data(buf, msg, hsz);
- destnode = msg_destnode(msg);
- p_ptr->publ.congested = 1;
- if (!tipc_port_congested(p_ptr)) {
- if (likely(destnode != tipc_own_addr))
- res = tipc_send_buf_fast(buf, destnode);
- else {
- tipc_port_recv_msg(buf);
- res = sz;
- }
- if (likely(res != -ELINKCONG)) {
- port_incr_out_seqno(p_ptr);
- p_ptr->sent++;
- p_ptr->publ.congested = 0;
- return res;
- }
- }
- if (port_unreliable(p_ptr)) {
- p_ptr->publ.congested = 0;
- return dsz;
- }
- return -ELINKCONG;
-}
-
-/**
* tipc_forward2name - forward message sections to port name
*/
-int tipc_forward2name(u32 ref,
- struct tipc_name const *name,
- u32 domain,
- u32 num_sect,
- struct iovec const *msg_sect,
- struct tipc_portid const *orig,
- unsigned int importance)
+static int tipc_forward2name(u32 ref,
+ struct tipc_name const *name,
+ u32 domain,
+ u32 num_sect,
+ struct iovec const *msg_sect,
+ struct tipc_portid const *orig,
+ unsigned int importance)
{
struct port *p_ptr;
struct tipc_msg *msg;
@@ -1473,7 +1306,7 @@ int tipc_forward2name(u32 ref,
msg_set_destnode(msg, destnode);
msg_set_destport(msg, destport);
- if (likely(destport || destnode)) {
+ if (likely(destport)) {
p_ptr->sent++;
if (likely(destnode == tipc_own_addr))
return tipc_port_recv_sections(p_ptr, num_sect, msg_sect);
@@ -1510,89 +1343,15 @@ int tipc_send2name(u32 ref,
}
/**
- * tipc_forward_buf2name - forward message buffer to port name
- */
-
-int tipc_forward_buf2name(u32 ref,
- struct tipc_name const *name,
- u32 domain,
- struct sk_buff *buf,
- unsigned int dsz,
- struct tipc_portid const *orig,
- unsigned int importance)
-{
- struct port *p_ptr;
- struct tipc_msg *msg;
- u32 destnode = domain;
- u32 destport;
- int res;
-
- p_ptr = (struct port *)tipc_ref_deref(ref);
- if (!p_ptr || p_ptr->publ.connected)
- return -EINVAL;
-
- msg = &p_ptr->publ.phdr;
- if (importance <= TIPC_CRITICAL_IMPORTANCE)
- msg_set_importance(msg, importance);
- msg_set_type(msg, TIPC_NAMED_MSG);
- msg_set_orignode(msg, orig->node);
- msg_set_origport(msg, orig->ref);
- msg_set_nametype(msg, name->type);
- msg_set_nameinst(msg, name->instance);
- msg_set_lookup_scope(msg, tipc_addr_scope(domain));
- msg_set_hdr_sz(msg, LONG_H_SIZE);
- msg_set_size(msg, LONG_H_SIZE + dsz);
- destport = tipc_nametbl_translate(name->type, name->instance, &destnode);
- msg_set_destnode(msg, destnode);
- msg_set_destport(msg, destport);
- msg_dbg(msg, "forw2name ==> ");
- if (skb_cow(buf, LONG_H_SIZE))
- return -ENOMEM;
- skb_push(buf, LONG_H_SIZE);
- skb_copy_to_linear_data(buf, msg, LONG_H_SIZE);
- msg_dbg(buf_msg(buf),"PREP:");
- if (likely(destport || destnode)) {
- p_ptr->sent++;
- if (destnode == tipc_own_addr)
- return tipc_port_recv_msg(buf);
- res = tipc_send_buf_fast(buf, destnode);
- if (likely(res != -ELINKCONG))
- return res;
- if (port_unreliable(p_ptr))
- return dsz;
- return -ELINKCONG;
- }
- return tipc_reject_msg(buf, TIPC_ERR_NO_NAME);
-}
-
-/**
- * tipc_send_buf2name - send message buffer to port name
- */
-
-int tipc_send_buf2name(u32 ref,
- struct tipc_name const *dest,
- u32 domain,
- struct sk_buff *buf,
- unsigned int dsz)
-{
- struct tipc_portid orig;
-
- orig.ref = ref;
- orig.node = tipc_own_addr;
- return tipc_forward_buf2name(ref, dest, domain, buf, dsz, &orig,
- TIPC_PORT_IMPORTANCE);
-}
-
-/**
* tipc_forward2port - forward message sections to port identity
*/
-int tipc_forward2port(u32 ref,
- struct tipc_portid const *dest,
- unsigned int num_sect,
- struct iovec const *msg_sect,
- struct tipc_portid const *orig,
- unsigned int importance)
+static int tipc_forward2port(u32 ref,
+ struct tipc_portid const *dest,
+ unsigned int num_sect,
+ struct iovec const *msg_sect,
+ struct tipc_portid const *orig,
+ unsigned int importance)
{
struct port *p_ptr;
struct tipc_msg *msg;
@@ -1644,12 +1403,12 @@ int tipc_send2port(u32 ref,
/**
* tipc_forward_buf2port - forward message buffer to port identity
*/
-int tipc_forward_buf2port(u32 ref,
- struct tipc_portid const *dest,
- struct sk_buff *buf,
- unsigned int dsz,
- struct tipc_portid const *orig,
- unsigned int importance)
+static int tipc_forward_buf2port(u32 ref,
+ struct tipc_portid const *dest,
+ struct sk_buff *buf,
+ unsigned int dsz,
+ struct tipc_portid const *orig,
+ unsigned int importance)
{
struct port *p_ptr;
struct tipc_msg *msg;
diff --git a/net/tipc/port.h b/net/tipc/port.h
index 8d1652aab29..73bbf442b34 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -109,8 +109,6 @@ struct port {
extern spinlock_t tipc_port_list_lock;
struct port_list;
-int tipc_port_recv_sections(struct port *p_ptr, u32 num_sect,
- struct iovec const *msg_sect);
int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
struct iovec const *msg_sect, u32 num_sect,
int err);
@@ -157,7 +155,7 @@ static inline u32 tipc_peer_node(struct port *p_ptr)
static inline int tipc_port_congested(struct port *p_ptr)
{
- return((p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2));
+ return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2);
}
/**
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
index 8dea66500cf..ab8ad32d8c2 100644
--- a/net/tipc/ref.c
+++ b/net/tipc/ref.c
@@ -282,23 +282,6 @@ void *tipc_ref_lock(u32 ref)
return NULL;
}
-/**
- * tipc_ref_unlock - unlock referenced object
- */
-
-void tipc_ref_unlock(u32 ref)
-{
- if (likely(tipc_ref_table.entries)) {
- struct reference *entry;
-
- entry = &tipc_ref_table.entries[ref &
- tipc_ref_table.index_mask];
- if (likely((entry->ref == ref) && (entry->object)))
- spin_unlock_bh(&entry->lock);
- else
- err("Attempt to unlock non-existent reference\n");
- }
-}
/**
* tipc_ref_deref - return pointer referenced object (without locking it)
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
index 7e3798ea93b..5bc8e7ab84d 100644
--- a/net/tipc/ref.h
+++ b/net/tipc/ref.h
@@ -44,7 +44,6 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock);
void tipc_ref_discard(u32 ref);
void *tipc_ref_lock(u32 ref);
-void tipc_ref_unlock(u32 ref);
void *tipc_ref_deref(u32 ref);
#endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 66e889ba48f..33217fc3d69 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -64,6 +64,7 @@ struct tipc_sock {
struct sock sk;
struct tipc_port *p;
struct tipc_portid peer_name;
+ long conn_timeout;
};
#define tipc_sk(sk) ((struct tipc_sock *)(sk))
@@ -240,9 +241,9 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol,
sock->state = state;
sock_init_data(sock, sk);
- sk->sk_rcvtimeo = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
sk->sk_backlog_rcv = backlog_rcv;
tipc_sk(sk)->p = tp_ptr;
+ tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
spin_unlock_bh(tp_ptr->lock);
@@ -429,36 +430,55 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr,
* to handle any preventable race conditions, so TIPC will do the same ...
*
* TIPC sets the returned events as follows:
- * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty
- * or if a connection-oriented socket is does not have an active connection
- * (i.e. a read operation will not block).
- * b) POLLOUT is set except when a socket's connection has been terminated
- * (i.e. a write operation will not block).
- * c) POLLHUP is set when a socket's connection has been terminated.
- *
- * IMPORTANT: The fact that a read or write operation will not block does NOT
- * imply that the operation will succeed!
+ *
+ * socket state flags set
+ * ------------ ---------
+ * unconnected no read flags
+ * no write flags
+ *
+ * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue
+ * no write flags
+ *
+ * connected POLLIN/POLLRDNORM if data in rx queue
+ * POLLOUT if port is not congested
+ *
+ * disconnecting POLLIN/POLLRDNORM/POLLHUP
+ * no write flags
+ *
+ * listening POLLIN if SYN in rx queue
+ * no write flags
+ *
+ * ready POLLIN/POLLRDNORM if data in rx queue
+ * [connectionless] POLLOUT (since port cannot be congested)
+ *
+ * IMPORTANT: The fact that a read or write operation is indicated does NOT
+ * imply that the operation will succeed, merely that it should be performed
+ * and will not block.
*/
static unsigned int poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
- u32 mask;
+ u32 mask = 0;
poll_wait(file, sk_sleep(sk), wait);
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sock->state == SS_UNCONNECTED) ||
- (sock->state == SS_DISCONNECTING))
- mask = (POLLRDNORM | POLLIN);
- else
- mask = 0;
-
- if (sock->state == SS_DISCONNECTING)
- mask |= POLLHUP;
- else
- mask |= POLLOUT;
+ switch ((int)sock->state) {
+ case SS_READY:
+ case SS_CONNECTED:
+ if (!tipc_sk_port(sk)->congested)
+ mask |= POLLOUT;
+ /* fall thru' */
+ case SS_CONNECTING:
+ case SS_LISTENING:
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ mask |= (POLLIN | POLLRDNORM);
+ break;
+ case SS_DISCONNECTING:
+ mask = (POLLIN | POLLRDNORM | POLLHUP);
+ break;
+ }
return mask;
}
@@ -1026,9 +1046,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
struct sk_buff *buf;
struct tipc_msg *msg;
unsigned int sz;
- int sz_to_copy;
+ int sz_to_copy, target, needed;
int sz_copied = 0;
- int needed;
char __user *crs = m->msg_iov->iov_base;
unsigned char *buf_crs;
u32 err;
@@ -1050,6 +1069,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
goto exit;
}
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
+
restart:
/* Look for a message in receive queue; wait if necessary */
@@ -1138,7 +1159,7 @@ restart:
if ((sz_copied < buf_len) && /* didn't get all requested data */
(!skb_queue_empty(&sk->sk_receive_queue) ||
- (flags & MSG_WAITALL)) && /* and more is ready or required */
+ (sz_copied < target)) && /* and more is ready or required */
(!(flags & MSG_PEEK)) && /* and aren't just peeking at data */
(!err)) /* and haven't reached a FIN */
goto restart;
@@ -1174,7 +1195,7 @@ static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base)
if (msg_connected(msg))
threshold *= 4;
- return (queue_size >= threshold);
+ return queue_size >= threshold;
}
/**
@@ -1365,6 +1386,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
struct msghdr m = {NULL,};
struct sk_buff *buf;
struct tipc_msg *msg;
+ long timeout;
int res;
lock_sock(sk);
@@ -1379,7 +1401,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
/* For now, TIPC does not support the non-blocking form of connect() */
if (flags & O_NONBLOCK) {
- res = -EWOULDBLOCK;
+ res = -EOPNOTSUPP;
goto exit;
}
@@ -1425,11 +1447,12 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
/* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
+ timeout = tipc_sk(sk)->conn_timeout;
release_sock(sk);
res = wait_event_interruptible_timeout(*sk_sleep(sk),
(!skb_queue_empty(&sk->sk_receive_queue) ||
(sock->state != SS_CONNECTING)),
- sk->sk_rcvtimeo);
+ timeout ? timeout : MAX_SCHEDULE_TIMEOUT);
lock_sock(sk);
if (res > 0) {
@@ -1692,7 +1715,7 @@ static int setsockopt(struct socket *sock,
res = tipc_set_portunreturnable(tport->ref, value);
break;
case TIPC_CONN_TIMEOUT:
- sk->sk_rcvtimeo = msecs_to_jiffies(value);
+ tipc_sk(sk)->conn_timeout = msecs_to_jiffies(value);
/* no need to set "res", since already 0 at this point */
break;
default:
@@ -1747,7 +1770,7 @@ static int getsockopt(struct socket *sock,
res = tipc_portunreturnable(tport->ref, &value);
break;
case TIPC_CONN_TIMEOUT:
- value = jiffies_to_msecs(sk->sk_rcvtimeo);
+ value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout);
/* no need to set "res", since already 0 at this point */
break;
case TIPC_NODE_RECVQ_DEPTH:
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index ab6eab4c45e..33313961d01 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -76,6 +76,19 @@ struct top_srv {
static struct top_srv topsrv = { 0 };
/**
+ * htohl - convert value to endianness used by destination
+ * @in: value to convert
+ * @swap: non-zero if endianness must be reversed
+ *
+ * Returns converted value
+ */
+
+static u32 htohl(u32 in, int swap)
+{
+ return swap ? swab32(in) : in;
+}
+
+/**
* subscr_send_event - send a message containing a tipc_event to the subscriber
*
* Note: Must not hold subscriber's server port lock, since tipc_send() will
@@ -94,11 +107,11 @@ static void subscr_send_event(struct subscription *sub,
msg_sect.iov_base = (void *)&sub->evt;
msg_sect.iov_len = sizeof(struct tipc_event);
- sub->evt.event = htonl(event);
- sub->evt.found_lower = htonl(found_lower);
- sub->evt.found_upper = htonl(found_upper);
- sub->evt.port.ref = htonl(port_ref);
- sub->evt.port.node = htonl(node);
+ sub->evt.event = htohl(event, sub->swap);
+ sub->evt.found_lower = htohl(found_lower, sub->swap);
+ sub->evt.found_upper = htohl(found_upper, sub->swap);
+ sub->evt.port.ref = htohl(port_ref, sub->swap);
+ sub->evt.port.node = htohl(node, sub->swap);
tipc_send(sub->server_ref, 1, &msg_sect);
}
@@ -274,29 +287,16 @@ static void subscr_cancel(struct tipc_subscr *s,
{
struct subscription *sub;
struct subscription *sub_temp;
- __u32 type, lower, upper, timeout, filter;
int found = 0;
/* Find first matching subscription, exit if not found */
- type = ntohl(s->seq.type);
- lower = ntohl(s->seq.lower);
- upper = ntohl(s->seq.upper);
- timeout = ntohl(s->timeout);
- filter = ntohl(s->filter) & ~TIPC_SUB_CANCEL;
-
list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
subscription_list) {
- if ((type == sub->seq.type) &&
- (lower == sub->seq.lower) &&
- (upper == sub->seq.upper) &&
- (timeout == sub->timeout) &&
- (filter == sub->filter) &&
- !memcmp(s->usr_handle,sub->evt.s.usr_handle,
- sizeof(s->usr_handle)) ){
- found = 1;
- break;
- }
+ if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
+ found = 1;
+ break;
+ }
}
if (!found)
return;
@@ -310,7 +310,7 @@ static void subscr_cancel(struct tipc_subscr *s,
k_term_timer(&sub->timer);
spin_lock_bh(subscriber->lock);
}
- dbg("Cancel: removing sub %u,%u,%u from subscriber %p list\n",
+ dbg("Cancel: removing sub %u,%u,%u from subscriber %x list\n",
sub->seq.type, sub->seq.lower, sub->seq.upper, subscriber);
subscr_del(sub);
}
@@ -325,10 +325,16 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
struct subscriber *subscriber)
{
struct subscription *sub;
+ int swap;
+
+ /* Determine subscriber's endianness */
+
+ swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
/* Detect & process a subscription cancellation request */
- if (ntohl(s->filter) & TIPC_SUB_CANCEL) {
+ if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
+ s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
subscr_cancel(s, subscriber);
return NULL;
}
@@ -353,12 +359,13 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
/* Initialize subscription object */
- sub->seq.type = ntohl(s->seq.type);
- sub->seq.lower = ntohl(s->seq.lower);
- sub->seq.upper = ntohl(s->seq.upper);
- sub->timeout = ntohl(s->timeout);
- sub->filter = ntohl(s->filter);
- if ((sub->filter && (sub->filter != TIPC_SUB_PORTS)) ||
+ sub->seq.type = htohl(s->seq.type, swap);
+ sub->seq.lower = htohl(s->seq.lower, swap);
+ sub->seq.upper = htohl(s->seq.upper, swap);
+ sub->timeout = htohl(s->timeout, swap);
+ sub->filter = htohl(s->filter, swap);
+ if ((!(sub->filter & TIPC_SUB_PORTS) ==
+ !(sub->filter & TIPC_SUB_SERVICE)) ||
(sub->seq.lower > sub->seq.upper)) {
warn("Subscription rejected, illegal request\n");
kfree(sub);
@@ -369,6 +376,7 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
INIT_LIST_HEAD(&sub->nameseq_list);
list_add(&sub->subscription_list, &subscriber->subscription_list);
sub->server_ref = subscriber->port_ref;
+ sub->swap = swap;
memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
atomic_inc(&topsrv.subscription_count);
if (sub->timeout != TIPC_WAIT_FOREVER) {
@@ -598,12 +606,3 @@ void tipc_subscr_stop(void)
topsrv.user_ref = 0;
}
}
-
-
-int tipc_ispublished(struct tipc_name const *name)
-{
- u32 domain = 0;
-
- return(tipc_nametbl_translate(name->type, name->instance,&domain) != 0);
-}
-
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index c20f496d95b..45d89bf4d20 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -53,6 +53,7 @@ typedef void (*tipc_subscr_event) (struct subscription *sub,
* @nameseq_list: adjacent subscriptions in name sequence's subscription list
* @subscription_list: adjacent subscriptions in subscriber's subscription list
* @server_ref: object reference of server port associated with subscription
+ * @swap: indicates if subscriber uses opposite endianness in its messages
* @evt: template for events generated by subscription
*/
@@ -65,6 +66,7 @@ struct subscription {
struct list_head nameseq_list;
struct list_head subscription_list;
u32 server_ref;
+ int swap;
struct tipc_event evt;
};
diff --git a/net/tipc/zone.c b/net/tipc/zone.c
index 2c01ba2d86b..83f8b5e91fc 100644
--- a/net/tipc/zone.c
+++ b/net/tipc/zone.c
@@ -160,14 +160,3 @@ u32 tipc_zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref)
}
return 0;
}
-
-
-u32 tipc_zone_next_node(u32 addr)
-{
- struct cluster *c_ptr = tipc_cltr_find(addr);
-
- if (c_ptr)
- return tipc_cltr_next_node(c_ptr, addr);
- return 0;
-}
-
diff --git a/net/tipc/zone.h b/net/tipc/zone.h
index 7bdc3406ba9..bd1c20ce9d0 100644
--- a/net/tipc/zone.h
+++ b/net/tipc/zone.h
@@ -61,7 +61,6 @@ void tipc_zone_send_external_routes(struct _zone *z_ptr, u32 dest);
struct _zone *tipc_zone_create(u32 addr);
void tipc_zone_delete(struct _zone *z_ptr);
void tipc_zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr);
-u32 tipc_zone_next_node(u32 addr);
static inline struct _zone *tipc_zone_find(u32 addr)
{
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4414a18c63b..0ebc777a666 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock)
static u32 ordernum = 1;
struct unix_address *addr;
int err;
+ unsigned int retries = 0;
mutex_lock(&u->readlock);
@@ -717,9 +718,17 @@ retry:
if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
addr->hash)) {
spin_unlock(&unix_table_lock);
- /* Sanity yield. It is unusual case, but yet... */
- if (!(ordernum&0xFF))
- yield();
+ /*
+ * __unix_find_socket_byname() may take long time if many names
+ * are already in use.
+ */
+ cond_resched();
+ /* Give up if all names seems to be in use. */
+ if (retries++ == 0xFFFFF) {
+ err = -ENOSPC;
+ kfree(addr);
+ goto out;
+ }
goto retry;
}
addr->hash ^= sk->sk_type;
@@ -1502,6 +1511,8 @@ restart:
goto restart;
}
+ if (sock_flag(other, SOCK_RCVTSTAMP))
+ __net_timestamp(skb);
skb_queue_tail(&other->sk_receive_queue, skb);
unix_state_unlock(other);
other->sk_data_ready(other, len);
@@ -1713,6 +1724,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
if (err)
goto out_free;
+ if (sock_flag(sk, SOCK_RCVTSTAMP))
+ __sock_recv_timestamp(msg, sk, skb);
+
if (!siocb->scm) {
siocb->scm = &tmp_scm;
memset(&tmp_scm, 0, sizeof(tmp_scm));
@@ -2024,11 +2038,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLRDHUP;
+ mask |= POLLRDHUP | POLLIN | POLLRDNORM;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
- (sk->sk_shutdown & RCV_SHUTDOWN))
+ if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
/* Connection-based need to check for termination and startup */
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 541e2fff5e9..9c21ebf9780 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -253,11 +253,16 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
WARN_ON(err);
wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
}
+
+ return err;
}
wiphy_net_set(&rdev->wiphy, net);
- return err;
+ err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev));
+ WARN_ON(err);
+
+ return 0;
}
static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
@@ -428,7 +433,7 @@ int wiphy_register(struct wiphy *wiphy)
/* sanity check ifmodes */
WARN_ON(!ifmodes);
- ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1;
+ ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1;
if (WARN_ON(ifmodes != wiphy->interface_modes))
wiphy->interface_modes = ifmodes;
@@ -475,12 +480,10 @@ int wiphy_register(struct wiphy *wiphy)
mutex_lock(&cfg80211_mutex);
res = device_add(&rdev->wiphy.dev);
- if (res)
- goto out_unlock;
-
- res = rfkill_register(rdev->rfkill);
- if (res)
- goto out_rm_dev;
+ if (res) {
+ mutex_unlock(&cfg80211_mutex);
+ return res;
+ }
/* set up regulatory info */
wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
@@ -509,13 +512,18 @@ int wiphy_register(struct wiphy *wiphy)
cfg80211_debugfs_rdev_add(rdev);
mutex_unlock(&cfg80211_mutex);
+ /*
+ * due to a locking dependency this has to be outside of the
+ * cfg80211_mutex lock
+ */
+ res = rfkill_register(rdev->rfkill);
+ if (res)
+ goto out_rm_dev;
+
return 0;
out_rm_dev:
device_del(&rdev->wiphy.dev);
-
-out_unlock:
- mutex_unlock(&cfg80211_mutex);
return res;
}
EXPORT_SYMBOL(wiphy_register);
@@ -680,8 +688,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work);
INIT_LIST_HEAD(&wdev->event_list);
spin_lock_init(&wdev->event_lock);
- INIT_LIST_HEAD(&wdev->action_registrations);
- spin_lock_init(&wdev->action_registrations_lock);
+ INIT_LIST_HEAD(&wdev->mgmt_registrations);
+ spin_lock_init(&wdev->mgmt_registrations_lock);
mutex_lock(&rdev->devlist_mtx);
list_add_rcu(&wdev->list, &rdev->netdev_list);
@@ -721,6 +729,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
dev->ethtool_ops = &cfg80211_ethtool_ops;
if ((wdev->iftype == NL80211_IFTYPE_STATION ||
+ wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
dev->priv_flags |= IFF_DONT_BRIDGE;
break;
@@ -729,6 +738,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
case NL80211_IFTYPE_ADHOC:
cfg80211_leave_ibss(rdev, dev, true);
break;
+ case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_STATION:
wdev_lock(wdev);
#ifdef CONFIG_CFG80211_WEXT
@@ -801,7 +811,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
sysfs_remove_link(&dev->dev.kobj, "phy80211");
list_del_rcu(&wdev->list);
rdev->devlist_generation++;
- cfg80211_mlme_purge_actions(wdev);
+ cfg80211_mlme_purge_registrations(wdev);
#ifdef CONFIG_CFG80211_WEXT
kfree(wdev->wext.keys);
#endif
@@ -907,52 +917,3 @@ static void __exit cfg80211_exit(void)
destroy_workqueue(cfg80211_wq);
}
module_exit(cfg80211_exit);
-
-static int ___wiphy_printk(const char *level, const struct wiphy *wiphy,
- struct va_format *vaf)
-{
- if (!wiphy)
- return printk("%s(NULL wiphy *): %pV", level, vaf);
-
- return printk("%s%s: %pV", level, wiphy_name(wiphy), vaf);
-}
-
-int __wiphy_printk(const char *level, const struct wiphy *wiphy,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
- int r;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- r = ___wiphy_printk(level, wiphy, &vaf);
- va_end(args);
-
- return r;
-}
-EXPORT_SYMBOL(__wiphy_printk);
-
-#define define_wiphy_printk_level(func, kern_level) \
-int func(const struct wiphy *wiphy, const char *fmt, ...) \
-{ \
- struct va_format vaf; \
- va_list args; \
- int r; \
- \
- va_start(args, fmt); \
- \
- vaf.fmt = fmt; \
- vaf.va = &args; \
- \
- r = ___wiphy_printk(kern_level, wiphy, &vaf); \
- va_end(args); \
- \
- return r; \
-} \
-EXPORT_SYMBOL(func);
-
-define_wiphy_printk_level(wiphy_debug, KERN_DEBUG);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 63d57ae399c..6583cca0e2e 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -86,7 +86,7 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
static inline
bool wiphy_idx_valid(int wiphy_idx)
{
- return (wiphy_idx >= 0);
+ return wiphy_idx >= 0;
}
@@ -95,7 +95,10 @@ extern struct mutex cfg80211_mutex;
extern struct list_head cfg80211_rdev_list;
extern int cfg80211_rdev_list_generation;
-#define assert_cfg80211_lock() WARN_ON(!mutex_is_locked(&cfg80211_mutex))
+static inline void assert_cfg80211_lock(void)
+{
+ lockdep_assert_held(&cfg80211_mutex);
+}
/*
* You can use this to mark a wiphy_idx as not having an associated wiphy.
@@ -202,8 +205,8 @@ static inline void wdev_unlock(struct wireless_dev *wdev)
mutex_unlock(&wdev->mtx);
}
-#define ASSERT_RDEV_LOCK(rdev) WARN_ON(!mutex_is_locked(&(rdev)->mtx));
-#define ASSERT_WDEV_LOCK(wdev) WARN_ON(!mutex_is_locked(&(wdev)->mtx));
+#define ASSERT_RDEV_LOCK(rdev) lockdep_assert_held(&(rdev)->mtx)
+#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx)
enum cfg80211_event_type {
EVENT_CONNECT_RESULT,
@@ -331,16 +334,17 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
const u8 *resp_ie, size_t resp_ie_len,
u16 status, bool wextev,
struct cfg80211_bss *bss);
-int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid,
- const u8 *match_data, int match_len);
-void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid);
-void cfg80211_mlme_purge_actions(struct wireless_dev *wdev);
-int cfg80211_mlme_action(struct cfg80211_registered_device *rdev,
- struct net_device *dev,
- struct ieee80211_channel *chan,
- enum nl80211_channel_type channel_type,
- bool channel_type_valid,
- const u8 *buf, size_t len, u64 *cookie);
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
+ u16 frame_type, const u8 *match_data,
+ int match_len);
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_channel *chan,
+ enum nl80211_channel_type channel_type,
+ bool channel_type_valid,
+ const u8 *buf, size_t len, u64 *cookie);
/* SME */
int __cfg80211_connect(struct cfg80211_registered_device *rdev,
@@ -371,7 +375,7 @@ bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev);
/* internal helpers */
int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
struct key_params *params, int key_idx,
- const u8 *mac_addr);
+ bool pairwise, const u8 *mac_addr);
void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
size_t ie_len, u16 reason, bool from_ap);
void cfg80211_sme_scan_done(struct net_device *dev);
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index a4991a3efec..39765bcfb47 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -34,6 +34,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \
static const struct file_operations name## _ops = { \
.read = name## _read, \
.open = cfg80211_open_file_generic, \
+ .llseek = generic_file_llseek, \
};
DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
@@ -102,6 +103,7 @@ static ssize_t ht40allow_map_read(struct file *file,
static const struct file_operations ht40allow_map_ops = {
.read = ht40allow_map_read,
.open = cfg80211_open_file_generic,
+ .llseek = default_llseek,
};
#define DEBUGFS_ADD(name) \
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 27a8ce9343c..f33fbb79437 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -88,6 +88,25 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
if (wdev->ssid_len)
return -EALREADY;
+ if (!params->basic_rates) {
+ /*
+ * If no rates were explicitly configured,
+ * use the mandatory rate set for 11b or
+ * 11a for maximum compatibility.
+ */
+ struct ieee80211_supported_band *sband =
+ rdev->wiphy.bands[params->channel->band];
+ int j;
+ u32 flag = params->channel->band == IEEE80211_BAND_5GHZ ?
+ IEEE80211_RATE_MANDATORY_A :
+ IEEE80211_RATE_MANDATORY_B;
+
+ for (j = 0; j < sband->n_bitrates; j++) {
+ if (sband->bitrates[j].flags & flag)
+ params->basic_rates |= BIT(j);
+ }
+ }
+
if (WARN_ON(wdev->connect_keys))
kfree(wdev->connect_keys);
wdev->connect_keys = connkeys;
@@ -141,7 +160,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
*/
if (rdev->ops->del_key)
for (i = 0; i < 6; i++)
- rdev->ops->del_key(wdev->wiphy, dev, i, NULL);
+ rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
if (wdev->current_bss) {
cfg80211_unhold_bss(wdev->current_bss);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index e74a1a2119d..26838d903b9 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -149,7 +149,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
const u8 *bssid = mgmt->bssid;
int i;
- bool found = false;
+ bool found = false, was_current = false;
ASSERT_WDEV_LOCK(wdev);
@@ -159,6 +159,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
cfg80211_put_bss(&wdev->current_bss->pub);
wdev->current_bss = NULL;
found = true;
+ was_current = true;
} else for (i = 0; i < MAX_AUTH_BSSES; i++) {
if (wdev->auth_bsses[i] &&
memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) {
@@ -183,7 +184,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL);
- if (wdev->sme_state == CFG80211_SME_CONNECTED) {
+ if (wdev->sme_state == CFG80211_SME_CONNECTED && was_current) {
u16 reason_code;
bool from_ap;
@@ -747,31 +748,53 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
}
EXPORT_SYMBOL(cfg80211_new_sta);
-struct cfg80211_action_registration {
+struct cfg80211_mgmt_registration {
struct list_head list;
u32 nlpid;
int match_len;
+ __le16 frame_type;
+
u8 match[];
};
-int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid,
- const u8 *match_data, int match_len)
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
+ u16 frame_type, const u8 *match_data,
+ int match_len)
{
- struct cfg80211_action_registration *reg, *nreg;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+ struct cfg80211_mgmt_registration *reg, *nreg;
int err = 0;
+ u16 mgmt_type;
+
+ if (!wdev->wiphy->mgmt_stypes)
+ return -EOPNOTSUPP;
+
+ if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT)
+ return -EINVAL;
+
+ if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE))
+ return -EINVAL;
+
+ mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
+ if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type)))
+ return -EINVAL;
nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL);
if (!nreg)
return -ENOMEM;
- spin_lock_bh(&wdev->action_registrations_lock);
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
- list_for_each_entry(reg, &wdev->action_registrations, list) {
+ list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
int mlen = min(match_len, reg->match_len);
+ if (frame_type != le16_to_cpu(reg->frame_type))
+ continue;
+
if (memcmp(reg->match, match_data, mlen) == 0) {
err = -EALREADY;
break;
@@ -786,134 +809,212 @@ int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid,
memcpy(nreg->match, match_data, match_len);
nreg->match_len = match_len;
nreg->nlpid = snd_pid;
- list_add(&nreg->list, &wdev->action_registrations);
+ nreg->frame_type = cpu_to_le16(frame_type);
+ list_add(&nreg->list, &wdev->mgmt_registrations);
+
+ if (rdev->ops->mgmt_frame_register)
+ rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
+ frame_type, true);
out:
- spin_unlock_bh(&wdev->action_registrations_lock);
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
return err;
}
-void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid)
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid)
{
- struct cfg80211_action_registration *reg, *tmp;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+ struct cfg80211_mgmt_registration *reg, *tmp;
- spin_lock_bh(&wdev->action_registrations_lock);
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
- list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) {
- if (reg->nlpid == nlpid) {
- list_del(&reg->list);
- kfree(reg);
+ list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
+ if (reg->nlpid != nlpid)
+ continue;
+
+ if (rdev->ops->mgmt_frame_register) {
+ u16 frame_type = le16_to_cpu(reg->frame_type);
+
+ rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
+ frame_type, false);
}
+
+ list_del(&reg->list);
+ kfree(reg);
}
- spin_unlock_bh(&wdev->action_registrations_lock);
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
}
-void cfg80211_mlme_purge_actions(struct wireless_dev *wdev)
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
{
- struct cfg80211_action_registration *reg, *tmp;
+ struct cfg80211_mgmt_registration *reg, *tmp;
- spin_lock_bh(&wdev->action_registrations_lock);
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
- list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) {
+ list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
list_del(&reg->list);
kfree(reg);
}
- spin_unlock_bh(&wdev->action_registrations_lock);
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
}
-int cfg80211_mlme_action(struct cfg80211_registered_device *rdev,
- struct net_device *dev,
- struct ieee80211_channel *chan,
- enum nl80211_channel_type channel_type,
- bool channel_type_valid,
- const u8 *buf, size_t len, u64 *cookie)
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct ieee80211_channel *chan,
+ enum nl80211_channel_type channel_type,
+ bool channel_type_valid,
+ const u8 *buf, size_t len, u64 *cookie)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
const struct ieee80211_mgmt *mgmt;
+ u16 stype;
- if (rdev->ops->action == NULL)
+ if (!wdev->wiphy->mgmt_stypes)
return -EOPNOTSUPP;
+
+ if (!rdev->ops->mgmt_tx)
+ return -EOPNOTSUPP;
+
if (len < 24 + 1)
return -EINVAL;
mgmt = (const struct ieee80211_mgmt *) buf;
- if (!ieee80211_is_action(mgmt->frame_control))
+
+ if (!ieee80211_is_mgmt(mgmt->frame_control))
return -EINVAL;
- if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
- /* Verify that we are associated with the destination AP */
- if (!wdev->current_bss ||
- memcmp(wdev->current_bss->pub.bssid, mgmt->bssid,
- ETH_ALEN) != 0 ||
- (wdev->iftype == NL80211_IFTYPE_STATION &&
- memcmp(wdev->current_bss->pub.bssid, mgmt->da,
- ETH_ALEN) != 0))
- return -ENOTCONN;
+
+ stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+ if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4)))
+ return -EINVAL;
+
+ if (ieee80211_is_action(mgmt->frame_control) &&
+ mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
+ int err = 0;
+
+ wdev_lock(wdev);
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ if (!wdev->current_bss) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ if (memcmp(wdev->current_bss->pub.bssid,
+ mgmt->bssid, ETH_ALEN)) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ /*
+ * check for IBSS DA must be done by driver as
+ * cfg80211 doesn't track the stations
+ */
+ if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+ break;
+
+ /* for station, check that DA is the AP */
+ if (memcmp(wdev->current_bss->pub.bssid,
+ mgmt->da, ETH_ALEN)) {
+ err = -ENOTCONN;
+ break;
+ }
+ break;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_AP_VLAN:
+ if (memcmp(mgmt->bssid, dev->dev_addr, ETH_ALEN))
+ err = -EINVAL;
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+ wdev_unlock(wdev);
+
+ if (err)
+ return err;
}
if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0)
return -EINVAL;
/* Transmit the Action frame as requested by user space */
- return rdev->ops->action(&rdev->wiphy, dev, chan, channel_type,
- channel_type_valid, buf, len, cookie);
+ return rdev->ops->mgmt_tx(&rdev->wiphy, dev, chan, channel_type,
+ channel_type_valid, buf, len, cookie);
}
-bool cfg80211_rx_action(struct net_device *dev, int freq, const u8 *buf,
- size_t len, gfp_t gfp)
+bool cfg80211_rx_mgmt(struct net_device *dev, int freq, const u8 *buf,
+ size_t len, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct wiphy *wiphy = wdev->wiphy;
struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
- struct cfg80211_action_registration *reg;
- const u8 *action_data;
- int action_data_len;
+ struct cfg80211_mgmt_registration *reg;
+ const struct ieee80211_txrx_stypes *stypes =
+ &wiphy->mgmt_stypes[wdev->iftype];
+ struct ieee80211_mgmt *mgmt = (void *)buf;
+ const u8 *data;
+ int data_len;
bool result = false;
+ __le16 ftype = mgmt->frame_control &
+ cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
+ u16 stype;
- /* frame length - min size excluding category */
- action_data_len = len - (IEEE80211_MIN_ACTION_SIZE - 1);
+ stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4;
- /* action data starts with category */
- action_data = buf + IEEE80211_MIN_ACTION_SIZE - 1;
+ if (!(stypes->rx & BIT(stype)))
+ return false;
- spin_lock_bh(&wdev->action_registrations_lock);
+ data = buf + ieee80211_hdrlen(mgmt->frame_control);
+ data_len = len - ieee80211_hdrlen(mgmt->frame_control);
+
+ spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+ list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
+ if (reg->frame_type != ftype)
+ continue;
- list_for_each_entry(reg, &wdev->action_registrations, list) {
- if (reg->match_len > action_data_len)
+ if (reg->match_len > data_len)
continue;
- if (memcmp(reg->match, action_data, reg->match_len))
+ if (memcmp(reg->match, data, reg->match_len))
continue;
/* found match! */
/* Indicate the received Action frame to user space */
- if (nl80211_send_action(rdev, dev, reg->nlpid, freq,
- buf, len, gfp))
+ if (nl80211_send_mgmt(rdev, dev, reg->nlpid, freq,
+ buf, len, gfp))
continue;
result = true;
break;
}
- spin_unlock_bh(&wdev->action_registrations_lock);
+ spin_unlock_bh(&wdev->mgmt_registrations_lock);
return result;
}
-EXPORT_SYMBOL(cfg80211_rx_action);
+EXPORT_SYMBOL(cfg80211_rx_mgmt);
-void cfg80211_action_tx_status(struct net_device *dev, u64 cookie,
- const u8 *buf, size_t len, bool ack, gfp_t gfp)
+void cfg80211_mgmt_tx_status(struct net_device *dev, u64 cookie,
+ const u8 *buf, size_t len, bool ack, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct wiphy *wiphy = wdev->wiphy;
struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
/* Indicate TX status of the Action frame to user space */
- nl80211_send_action_tx_status(rdev, dev, cookie, buf, len, ack, gfp);
+ nl80211_send_mgmt_tx_status(rdev, dev, cookie, buf, len, ack, gfp);
}
-EXPORT_SYMBOL(cfg80211_action_tx_status);
+EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
void cfg80211_cqm_rssi_notify(struct net_device *dev,
enum nl80211_cqm_rssi_threshold_event rssi_event,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 37902a54e9c..c506241f863 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -23,6 +23,11 @@
#include "nl80211.h"
#include "reg.h"
+static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+
/* the netlink family */
static struct genl_family nl80211_fam = {
.id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */
@@ -31,6 +36,8 @@ static struct genl_family nl80211_fam = {
.version = 1, /* no particular meaning now */
.maxattr = NL80211_ATTR_MAX,
.netnsok = true,
+ .pre_doit = nl80211_pre_doit,
+ .post_doit = nl80211_post_doit,
};
/* internal helper: get rdev and dev */
@@ -86,6 +93,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
[NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
[NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
[NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
+ [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 },
[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
@@ -136,6 +144,8 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
.len = sizeof(struct nl80211_sta_flag_update),
},
[NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG },
+ [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 },
+ [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
[NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
@@ -156,9 +166,10 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
[NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 },
[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 },
+ [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 },
};
-/* policy for the attributes */
+/* policy for the key attributes */
static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
[NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN },
[NL80211_KEY_IDX] = { .type = NLA_U8 },
@@ -166,6 +177,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
[NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
[NL80211_KEY_DEFAULT] = { .type = NLA_FLAG },
[NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG },
+ [NL80211_KEY_TYPE] = { .type = NLA_U32 },
};
/* ifidx get helper */
@@ -188,6 +200,47 @@ static int nl80211_get_ifidx(struct netlink_callback *cb)
return res;
}
+static int nl80211_prepare_netdev_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct cfg80211_registered_device **rdev,
+ struct net_device **dev)
+{
+ int ifidx = cb->args[0];
+ int err;
+
+ if (!ifidx)
+ ifidx = nl80211_get_ifidx(cb);
+ if (ifidx < 0)
+ return ifidx;
+
+ cb->args[0] = ifidx;
+
+ rtnl_lock();
+
+ *dev = __dev_get_by_index(sock_net(skb->sk), ifidx);
+ if (!*dev) {
+ err = -ENODEV;
+ goto out_rtnl;
+ }
+
+ *rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out_rtnl;
+ }
+
+ return 0;
+ out_rtnl:
+ rtnl_unlock();
+ return err;
+}
+
+static void nl80211_finish_netdev_dump(struct cfg80211_registered_device *rdev)
+{
+ cfg80211_unlock_rdev(rdev);
+ rtnl_unlock();
+}
+
/* IE validation */
static bool is_valid_ie_attr(const struct nlattr *attr)
{
@@ -255,6 +308,7 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
struct key_parse {
struct key_params p;
int idx;
+ int type;
bool def, defmgmt;
};
@@ -285,6 +339,12 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
if (tb[NL80211_KEY_CIPHER])
k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);
+ if (tb[NL80211_KEY_TYPE]) {
+ k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
+ if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+ return -EINVAL;
+ }
+
return 0;
}
@@ -309,6 +369,12 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT];
k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT];
+ if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+ k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+ if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+ return -EINVAL;
+ }
+
return 0;
}
@@ -318,6 +384,7 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
memset(k, 0, sizeof(*k));
k->idx = -1;
+ k->type = -1;
if (info->attrs[NL80211_ATTR_KEY])
err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k);
@@ -382,7 +449,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
} else if (parse.defmgmt)
goto error;
err = cfg80211_validate_key_settings(rdev, &parse.p,
- parse.idx, NULL);
+ parse.idx, false, NULL);
if (err)
goto error;
result->params[parse.idx].cipher = parse.p.cipher;
@@ -401,18 +468,17 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
{
ASSERT_WDEV_LOCK(wdev);
- if (!netif_running(wdev->netdev))
- return -ENETDOWN;
-
switch (wdev->iftype) {
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
break;
case NL80211_IFTYPE_ADHOC:
if (!wdev->current_bss)
return -ENOLINK;
break;
case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
if (wdev->sme_state != CFG80211_SME_CONNECTED)
return -ENOLINK;
break;
@@ -437,6 +503,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
struct ieee80211_rate *rate;
int i;
u16 ifmodes = dev->wiphy.interface_modes;
+ const struct ieee80211_txrx_stypes *mgmt_stypes =
+ dev->wiphy.mgmt_stypes;
hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY);
if (!hdr)
@@ -464,6 +532,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN,
dev->wiphy.max_scan_ie_len);
+ if (dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)
+ NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_IBSS_RSN);
+
NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES,
sizeof(u32) * dev->wiphy.n_cipher_suites,
dev->wiphy.cipher_suites);
@@ -471,6 +542,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS,
dev->wiphy.max_num_pmkids);
+ if (dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL)
+ NLA_PUT_FLAG(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE);
+
nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES);
if (!nl_modes)
goto nla_put_failure;
@@ -587,12 +661,13 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
CMD(flush_pmksa, FLUSH_PMKSA);
CMD(remain_on_channel, REMAIN_ON_CHANNEL);
CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
- CMD(action, ACTION);
+ CMD(mgmt_tx, FRAME);
if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
i++;
NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS);
}
CMD(set_channel, SET_CHANNEL);
+ CMD(set_wds_peer, SET_WDS_PEER);
#undef CMD
@@ -608,6 +683,55 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
nla_nest_end(msg, nl_cmds);
+ if (mgmt_stypes) {
+ u16 stypes;
+ struct nlattr *nl_ftypes, *nl_ifs;
+ enum nl80211_iftype ift;
+
+ nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES);
+ if (!nl_ifs)
+ goto nla_put_failure;
+
+ for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+ nl_ftypes = nla_nest_start(msg, ift);
+ if (!nl_ftypes)
+ goto nla_put_failure;
+ i = 0;
+ stypes = mgmt_stypes[ift].tx;
+ while (stypes) {
+ if (stypes & 1)
+ NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
+ (i << 4) | IEEE80211_FTYPE_MGMT);
+ stypes >>= 1;
+ i++;
+ }
+ nla_nest_end(msg, nl_ftypes);
+ }
+
+ nla_nest_end(msg, nl_ifs);
+
+ nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES);
+ if (!nl_ifs)
+ goto nla_put_failure;
+
+ for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+ nl_ftypes = nla_nest_start(msg, ift);
+ if (!nl_ftypes)
+ goto nla_put_failure;
+ i = 0;
+ stypes = mgmt_stypes[ift].rx;
+ while (stypes) {
+ if (stypes & 1)
+ NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
+ (i << 4) | IEEE80211_FTYPE_MGMT);
+ stypes >>= 1;
+ i++;
+ }
+ nla_nest_end(msg, nl_ftypes);
+ }
+ nla_nest_end(msg, nl_ifs);
+ }
+
return genlmsg_end(msg, hdr);
nla_put_failure:
@@ -644,28 +768,18 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *msg;
- struct cfg80211_registered_device *dev;
-
- dev = cfg80211_get_dev_from_info(info);
- if (IS_ERR(dev))
- return PTR_ERR(dev);
+ struct cfg80211_registered_device *dev = info->user_ptr[0];
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
- goto out_err;
-
- if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0)
- goto out_free;
+ return -ENOMEM;
- cfg80211_unlock_rdev(dev);
+ if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
return genlmsg_reply(msg, info);
-
- out_free:
- nlmsg_free(msg);
- out_err:
- cfg80211_unlock_rdev(dev);
- return -ENOBUFS;
}
static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
@@ -709,7 +823,8 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
wdev->iftype == NL80211_IFTYPE_AP ||
wdev->iftype == NL80211_IFTYPE_WDS ||
wdev->iftype == NL80211_IFTYPE_MESH_POINT ||
- wdev->iftype == NL80211_IFTYPE_MONITOR;
+ wdev->iftype == NL80211_IFTYPE_MONITOR ||
+ wdev->iftype == NL80211_IFTYPE_P2P_GO;
}
static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
@@ -753,38 +868,48 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *netdev;
- int result;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *netdev = info->user_ptr[1];
- rtnl_lock();
+ return __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info);
+}
- result = get_rdev_dev_by_info_ifindex(info, &rdev, &netdev);
- if (result)
- goto unlock;
+static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ const u8 *bssid;
- result = __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info);
+ if (!info->attrs[NL80211_ATTR_MAC])
+ return -EINVAL;
- unlock:
- rtnl_unlock();
+ if (netif_running(dev))
+ return -EBUSY;
- return result;
+ if (!rdev->ops->set_wds_peer)
+ return -EOPNOTSUPP;
+
+ if (wdev->iftype != NL80211_IFTYPE_WDS)
+ return -EOPNOTSUPP;
+
+ bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+ return rdev->ops->set_wds_peer(wdev->wiphy, dev, bssid);
}
+
static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev;
struct net_device *netdev = NULL;
struct wireless_dev *wdev;
- int result, rem_txq_params = 0;
+ int result = 0, rem_txq_params = 0;
struct nlattr *nl_txq_params;
u32 changed;
u8 retry_short = 0, retry_long = 0;
u32 frag_threshold = 0, rts_threshold = 0;
u8 coverage_class = 0;
- rtnl_lock();
-
/*
* Try to find the wiphy and netdev. Normally this
* function shouldn't need the netdev, but this is
@@ -811,8 +936,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
rdev = __cfg80211_rdev_from_info(info);
if (IS_ERR(rdev)) {
mutex_unlock(&cfg80211_mutex);
- result = PTR_ERR(rdev);
- goto unlock;
+ return PTR_ERR(rdev);
}
wdev = NULL;
netdev = NULL;
@@ -994,8 +1118,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
mutex_unlock(&rdev->mtx);
if (netdev)
dev_put(netdev);
- unlock:
- rtnl_unlock();
return result;
}
@@ -1075,33 +1197,20 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
{
struct sk_buff *msg;
- struct cfg80211_registered_device *dev;
- struct net_device *netdev;
- int err;
-
- err = get_rdev_dev_by_info_ifindex(info, &dev, &netdev);
- if (err)
- return err;
+ struct cfg80211_registered_device *dev = info->user_ptr[0];
+ struct net_device *netdev = info->user_ptr[1];
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
- goto out_err;
+ return -ENOMEM;
if (nl80211_send_iface(msg, info->snd_pid, info->snd_seq, 0,
- dev, netdev) < 0)
- goto out_free;
-
- dev_put(netdev);
- cfg80211_unlock_rdev(dev);
+ dev, netdev) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
return genlmsg_reply(msg, info);
-
- out_free:
- nlmsg_free(msg);
- out_err:
- dev_put(netdev);
- cfg80211_unlock_rdev(dev);
- return -ENOBUFS;
}
static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
@@ -1161,39 +1270,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct vif_params params;
int err;
enum nl80211_iftype otype, ntype;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
u32 _flags, *flags = NULL;
bool change = false;
memset(&params, 0, sizeof(params));
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
otype = ntype = dev->ieee80211_ptr->iftype;
if (info->attrs[NL80211_ATTR_IFTYPE]) {
ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
if (otype != ntype)
change = true;
- if (ntype > NL80211_IFTYPE_MAX) {
- err = -EINVAL;
- goto unlock;
- }
+ if (ntype > NL80211_IFTYPE_MAX)
+ return -EINVAL;
}
if (info->attrs[NL80211_ATTR_MESH_ID]) {
- if (ntype != NL80211_IFTYPE_MESH_POINT) {
- err = -EINVAL;
- goto unlock;
- }
+ if (ntype != NL80211_IFTYPE_MESH_POINT)
+ return -EINVAL;
params.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]);
params.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
change = true;
@@ -1204,20 +1303,18 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
change = true;
err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype);
if (err)
- goto unlock;
+ return err;
} else {
params.use_4addr = -1;
}
if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
- if (ntype != NL80211_IFTYPE_MONITOR) {
- err = -EINVAL;
- goto unlock;
- }
+ if (ntype != NL80211_IFTYPE_MONITOR)
+ return -EINVAL;
err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS],
&_flags);
if (err)
- goto unlock;
+ return err;
flags = &_flags;
change = true;
@@ -1231,17 +1328,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
if (!err && params.use_4addr != -1)
dev->ieee80211_ptr->use_4addr = params.use_4addr;
- unlock:
- dev_put(dev);
- cfg80211_unlock_rdev(rdev);
- unlock_rtnl:
- rtnl_unlock();
return err;
}
static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct vif_params params;
int err;
enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
@@ -1258,19 +1350,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- rtnl_lock();
-
- rdev = cfg80211_get_dev_from_info(info);
- if (IS_ERR(rdev)) {
- err = PTR_ERR(rdev);
- goto unlock_rtnl;
- }
-
if (!rdev->ops->add_virtual_intf ||
- !(rdev->wiphy.interface_modes & (1 << type))) {
- err = -EOPNOTSUPP;
- goto unlock;
- }
+ !(rdev->wiphy.interface_modes & (1 << type)))
+ return -EOPNOTSUPP;
if (type == NL80211_IFTYPE_MESH_POINT &&
info->attrs[NL80211_ATTR_MESH_ID]) {
@@ -1282,7 +1364,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type);
if (err)
- goto unlock;
+ return err;
}
err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ?
@@ -1292,38 +1374,18 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
nla_data(info->attrs[NL80211_ATTR_IFNAME]),
type, err ? NULL : &flags, &params);
- unlock:
- cfg80211_unlock_rdev(rdev);
- unlock_rtnl:
- rtnl_unlock();
return err;
}
static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->del_virtual_intf) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- err = rdev->ops->del_virtual_intf(&rdev->wiphy, dev);
+ if (!rdev->ops->del_virtual_intf)
+ return -EOPNOTSUPP;
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
- return err;
+ return rdev->ops->del_virtual_intf(&rdev->wiphy, dev);
}
struct get_key_cookie {
@@ -1376,11 +1438,12 @@ static void get_key_callback(void *c, struct key_params *params)
static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
u8 key_idx = 0;
- u8 *mac_addr = NULL;
+ const u8 *mac_addr = NULL;
+ bool pairwise;
struct get_key_cookie cookie = {
.error = 0,
};
@@ -1396,30 +1459,28 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->get_key) {
- err = -EOPNOTSUPP;
- goto out;
+ pairwise = !!mac_addr;
+ if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+ u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+ if (kt >= NUM_NL80211_KEYTYPES)
+ return -EINVAL;
+ if (kt != NL80211_KEYTYPE_GROUP &&
+ kt != NL80211_KEYTYPE_PAIRWISE)
+ return -EINVAL;
+ pairwise = kt == NL80211_KEYTYPE_PAIRWISE;
}
+ if (!rdev->ops->get_key)
+ return -EOPNOTSUPP;
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg) {
- err = -ENOMEM;
- goto out;
- }
+ if (!msg)
+ return -ENOMEM;
hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
NL80211_CMD_NEW_KEY);
-
- if (IS_ERR(hdr)) {
- err = PTR_ERR(hdr);
- goto free_msg;
- }
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
cookie.msg = msg;
cookie.idx = key_idx;
@@ -1429,8 +1490,12 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
if (mac_addr)
NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr);
- err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, mac_addr,
- &cookie, get_key_callback);
+ if (pairwise && mac_addr &&
+ !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ return -ENOENT;
+
+ err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, pairwise,
+ mac_addr, &cookie, get_key_callback);
if (err)
goto free_msg;
@@ -1439,28 +1504,21 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
goto nla_put_failure;
genlmsg_end(msg, hdr);
- err = genlmsg_reply(msg, info);
- goto out;
+ return genlmsg_reply(msg, info);
nla_put_failure:
err = -ENOBUFS;
free_msg:
nlmsg_free(msg);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
-
return err;
}
static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct key_parse key;
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
int (*func)(struct wiphy *wiphy, struct net_device *netdev,
u8 key_index);
@@ -1475,21 +1533,13 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
if (!key.def && !key.defmgmt)
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
if (key.def)
func = rdev->ops->set_default_key;
else
func = rdev->ops->set_default_mgmt_key;
- if (!func) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!func)
+ return -EOPNOTSUPP;
wdev_lock(dev->ieee80211_ptr);
err = nl80211_key_allowed(dev->ieee80211_ptr);
@@ -1506,23 +1556,16 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
#endif
wdev_unlock(dev->ieee80211_ptr);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-
- unlock_rtnl:
- rtnl_unlock();
-
return err;
}
static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
struct key_parse key;
- u8 *mac_addr = NULL;
+ const u8 *mac_addr = NULL;
err = nl80211_parse_key(info, &key);
if (err)
@@ -1534,43 +1577,42 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
- rtnl_lock();
+ if (key.type == -1) {
+ if (mac_addr)
+ key.type = NL80211_KEYTYPE_PAIRWISE;
+ else
+ key.type = NL80211_KEYTYPE_GROUP;
+ }
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
+ /* for now */
+ if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+ key.type != NL80211_KEYTYPE_GROUP)
+ return -EINVAL;
- if (!rdev->ops->add_key) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->add_key)
+ return -EOPNOTSUPP;
- if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, mac_addr)) {
- err = -EINVAL;
- goto out;
- }
+ if (cfg80211_validate_key_settings(rdev, &key.p, key.idx,
+ key.type == NL80211_KEYTYPE_PAIRWISE,
+ mac_addr))
+ return -EINVAL;
wdev_lock(dev->ieee80211_ptr);
err = nl80211_key_allowed(dev->ieee80211_ptr);
if (!err)
err = rdev->ops->add_key(&rdev->wiphy, dev, key.idx,
+ key.type == NL80211_KEYTYPE_PAIRWISE,
mac_addr, &key.p);
wdev_unlock(dev->ieee80211_ptr);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
-
return err;
}
static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
u8 *mac_addr = NULL;
struct key_parse key;
@@ -1581,21 +1623,32 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
- rtnl_lock();
+ if (key.type == -1) {
+ if (mac_addr)
+ key.type = NL80211_KEYTYPE_PAIRWISE;
+ else
+ key.type = NL80211_KEYTYPE_GROUP;
+ }
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
+ /* for now */
+ if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+ key.type != NL80211_KEYTYPE_GROUP)
+ return -EINVAL;
- if (!rdev->ops->del_key) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->del_key)
+ return -EOPNOTSUPP;
wdev_lock(dev->ieee80211_ptr);
err = nl80211_key_allowed(dev->ieee80211_ptr);
+
+ if (key.type == NL80211_KEYTYPE_PAIRWISE && mac_addr &&
+ !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ err = -ENOENT;
+
if (!err)
- err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx, mac_addr);
+ err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx,
+ key.type == NL80211_KEYTYPE_PAIRWISE,
+ mac_addr);
#ifdef CONFIG_CFG80211_WEXT
if (!err) {
@@ -1607,13 +1660,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
#endif
wdev_unlock(dev->ieee80211_ptr);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-
- unlock_rtnl:
- rtnl_unlock();
-
return err;
}
@@ -1621,35 +1667,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
{
int (*call)(struct wiphy *wiphy, struct net_device *dev,
struct beacon_parameters *info);
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct beacon_parameters params;
int haveinfo = 0;
if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_BEACON_TAIL]))
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
switch (info->genlhdr->cmd) {
case NL80211_CMD_NEW_BEACON:
/* these are required for NEW_BEACON */
if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] ||
!info->attrs[NL80211_ATTR_DTIM_PERIOD] ||
- !info->attrs[NL80211_ATTR_BEACON_HEAD]) {
- err = -EINVAL;
- goto out;
- }
+ !info->attrs[NL80211_ATTR_BEACON_HEAD])
+ return -EINVAL;
call = rdev->ops->add_beacon;
break;
@@ -1658,14 +1694,11 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
break;
default:
WARN_ON(1);
- err = -EOPNOTSUPP;
- goto out;
+ return -EOPNOTSUPP;
}
- if (!call) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!call)
+ return -EOPNOTSUPP;
memset(&params, 0, sizeof(params));
@@ -1695,52 +1728,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
haveinfo = 1;
}
- if (!haveinfo) {
- err = -EINVAL;
- goto out;
- }
-
- err = call(&rdev->wiphy, dev, &params);
-
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
+ if (!haveinfo)
+ return -EINVAL;
- return err;
+ return call(&rdev->wiphy, dev, &params);
}
static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->del_beacon) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
- err = -EOPNOTSUPP;
- goto out;
- }
- err = rdev->ops->del_beacon(&rdev->wiphy, dev);
+ if (!rdev->ops->del_beacon)
+ return -EOPNOTSUPP;
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
- return err;
+ return rdev->ops->del_beacon(&rdev->wiphy, dev);
}
static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = {
@@ -1861,6 +1867,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
if (sinfo->filled & STATION_INFO_TX_PACKETS)
NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS,
sinfo->tx_packets);
+ if (sinfo->filled & STATION_INFO_TX_RETRIES)
+ NLA_PUT_U32(msg, NL80211_STA_INFO_TX_RETRIES,
+ sinfo->tx_retries);
+ if (sinfo->filled & STATION_INFO_TX_FAILED)
+ NLA_PUT_U32(msg, NL80211_STA_INFO_TX_FAILED,
+ sinfo->tx_failed);
nla_nest_end(msg, sinfoattr);
return genlmsg_end(msg, hdr);
@@ -1877,28 +1889,12 @@ static int nl80211_dump_station(struct sk_buff *skb,
struct cfg80211_registered_device *dev;
struct net_device *netdev;
u8 mac_addr[ETH_ALEN];
- int ifidx = cb->args[0];
int sta_idx = cb->args[1];
int err;
- if (!ifidx)
- ifidx = nl80211_get_ifidx(cb);
- if (ifidx < 0)
- return ifidx;
-
- rtnl_lock();
-
- netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
- if (!netdev) {
- err = -ENODEV;
- goto out_rtnl;
- }
-
- dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
- if (IS_ERR(dev)) {
- err = PTR_ERR(dev);
- goto out_rtnl;
- }
+ err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+ if (err)
+ return err;
if (!dev->ops->dump_station) {
err = -EOPNOTSUPP;
@@ -1928,21 +1924,19 @@ static int nl80211_dump_station(struct sk_buff *skb,
cb->args[1] = sta_idx;
err = skb->len;
out_err:
- cfg80211_unlock_rdev(dev);
- out_rtnl:
- rtnl_unlock();
+ nl80211_finish_netdev_dump(dev);
return err;
}
static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct station_info sinfo;
struct sk_buff *msg;
u8 *mac_addr = NULL;
+ int err;
memset(&sinfo, 0, sizeof(sinfo));
@@ -1951,41 +1945,24 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->get_station) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->get_station)
+ return -EOPNOTSUPP;
err = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo);
if (err)
- goto out;
+ return err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
- goto out;
+ return -ENOMEM;
if (nl80211_send_station(msg, info->snd_pid, info->snd_seq, 0,
- dev, mac_addr, &sinfo) < 0)
- goto out_free;
-
- err = genlmsg_reply(msg, info);
- goto out;
-
- out_free:
- nlmsg_free(msg);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ dev, mac_addr, &sinfo) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
- return err;
+ return genlmsg_reply(msg, info);
}
/*
@@ -2015,9 +1992,9 @@ static int get_vlan(struct genl_info *info,
static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
struct station_parameters params;
u8 *mac_addr = NULL;
@@ -2055,12 +2032,6 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
params.plink_action =
nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
err = get_vlan(info, rdev, &params.vlan);
if (err)
goto out;
@@ -2071,10 +2042,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
switch (dev->ieee80211_ptr->iftype) {
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
/* disallow mesh-specific things */
if (params.plink_action)
err = -EINVAL;
break;
+ case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_STATION:
/* disallow everything but AUTHORIZED flag */
if (params.plink_action)
@@ -2120,19 +2093,15 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
out:
if (params.vlan)
dev_put(params.vlan);
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
return err;
}
static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
struct station_parameters params;
u8 *mac_addr = NULL;
@@ -2169,17 +2138,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
if (parse_station_flags(info, &params))
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN) {
- err = -EINVAL;
- goto out;
- }
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
err = get_vlan(info, rdev, &params.vlan);
if (err)
@@ -2193,61 +2155,33 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
goto out;
}
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
-
err = rdev->ops->add_station(&rdev->wiphy, dev, mac_addr, &params);
out:
if (params.vlan)
dev_put(params.vlan);
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
-
return err;
}
static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
u8 *mac_addr = NULL;
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
- err = -EINVAL;
- goto out;
- }
-
- if (!rdev->ops->del_station) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- err = rdev->ops->del_station(&rdev->wiphy, dev, mac_addr);
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ if (!rdev->ops->del_station)
+ return -EOPNOTSUPP;
- return err;
+ return rdev->ops->del_station(&rdev->wiphy, dev, mac_addr);
}
static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq,
@@ -2310,28 +2244,12 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
struct net_device *netdev;
u8 dst[ETH_ALEN];
u8 next_hop[ETH_ALEN];
- int ifidx = cb->args[0];
int path_idx = cb->args[1];
int err;
- if (!ifidx)
- ifidx = nl80211_get_ifidx(cb);
- if (ifidx < 0)
- return ifidx;
-
- rtnl_lock();
-
- netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
- if (!netdev) {
- err = -ENODEV;
- goto out_rtnl;
- }
-
- dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
- if (IS_ERR(dev)) {
- err = PTR_ERR(dev);
- goto out_rtnl;
- }
+ err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+ if (err)
+ return err;
if (!dev->ops->dump_mpath) {
err = -EOPNOTSUPP;
@@ -2365,18 +2283,15 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
cb->args[1] = path_idx;
err = skb->len;
out_err:
- cfg80211_unlock_rdev(dev);
- out_rtnl:
- rtnl_unlock();
-
+ nl80211_finish_netdev_dump(dev);
return err;
}
static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
struct mpath_info pinfo;
struct sk_buff *msg;
u8 *dst = NULL;
@@ -2389,53 +2304,33 @@ static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->get_mpath) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->get_mpath)
+ return -EOPNOTSUPP;
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
err = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, &pinfo);
if (err)
- goto out;
+ return err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
- goto out;
+ return -ENOMEM;
if (nl80211_send_mpath(msg, info->snd_pid, info->snd_seq, 0,
- dev, dst, next_hop, &pinfo) < 0)
- goto out_free;
-
- err = genlmsg_reply(msg, info);
- goto out;
-
- out_free:
- nlmsg_free(msg);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ dev, dst, next_hop, &pinfo) < 0) {
+ nlmsg_free(msg);
+ return -ENOBUFS;
+ }
- return err;
+ return genlmsg_reply(msg, info);
}
static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
u8 *dst = NULL;
u8 *next_hop = NULL;
@@ -2448,42 +2343,19 @@ static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->change_mpath) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
-
- err = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
+ if (!rdev->ops->change_mpath)
+ return -EOPNOTSUPP;
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
- return err;
+ return rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
}
+
static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
u8 *dst = NULL;
u8 *next_hop = NULL;
@@ -2496,75 +2368,34 @@ static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->add_mpath) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
-
- err = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
+ if (!rdev->ops->add_mpath)
+ return -EOPNOTSUPP;
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
- return err;
+ return rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
}
static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
u8 *dst = NULL;
if (info->attrs[NL80211_ATTR_MAC])
dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->del_mpath) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- err = rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
-
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ if (!rdev->ops->del_mpath)
+ return -EOPNOTSUPP;
- return err;
+ return rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
}
static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct bss_parameters params;
memset(&params, 0, sizeof(params));
@@ -2592,31 +2423,14 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_AP_ISOLATE])
params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]);
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->change_bss) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- err = rdev->ops->change_bss(&rdev->wiphy, dev, &params);
+ if (!rdev->ops->change_bss)
+ return -EOPNOTSUPP;
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
- return err;
+ return rdev->ops->change_bss(&rdev->wiphy, dev, &params);
}
static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
@@ -2695,37 +2509,26 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
static int nl80211_get_mesh_params(struct sk_buff *skb,
struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct mesh_config cur_params;
int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
void *hdr;
struct nlattr *pinfoattr;
struct sk_buff *msg;
- rtnl_lock();
-
- /* Look up our device */
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->get_mesh_params) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->get_mesh_params)
+ return -EOPNOTSUPP;
/* Get the mesh params */
err = rdev->ops->get_mesh_params(&rdev->wiphy, dev, &cur_params);
if (err)
- goto out;
+ return err;
/* Draw up a netlink message to send back */
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg) {
- err = -ENOBUFS;
- goto out;
- }
+ if (!msg)
+ return -ENOMEM;
hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
NL80211_CMD_GET_MESH_PARAMS);
if (!hdr)
@@ -2764,21 +2567,12 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
cur_params.dot11MeshHWMPRootMode);
nla_nest_end(msg, pinfoattr);
genlmsg_end(msg, hdr);
- err = genlmsg_reply(msg, info);
- goto out;
+ return genlmsg_reply(msg, info);
nla_put_failure:
genlmsg_cancel(msg, hdr);
nlmsg_free(msg);
- err = -EMSGSIZE;
- out:
- /* Cleanup */
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
-
- return err;
+ return -ENOBUFS;
}
#define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \
@@ -2808,10 +2602,9 @@ static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_A
static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
{
- int err;
u32 mask;
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct mesh_config cfg;
struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
struct nlattr *parent_attr;
@@ -2823,16 +2616,8 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
parent_attr, nl80211_meshconf_params_policy))
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (!rdev->ops->set_mesh_params) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->set_mesh_params)
+ return -EOPNOTSUPP;
/* This makes sure that there aren't more than 32 mesh config
* parameters (otherwise our bitfield scheme would not work.) */
@@ -2878,16 +2663,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
nla_get_u8);
/* Apply changes */
- err = rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask);
-
- out:
- /* cleanup */
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
-
- return err;
+ return rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask);
}
#undef FILL_IN_MESH_PARAM_IF_SET
@@ -3070,8 +2846,8 @@ static int validate_scan_freqs(struct nlattr *freqs)
static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct cfg80211_scan_request *request;
struct cfg80211_ssid *ssid;
struct ieee80211_channel *channel;
@@ -3084,36 +2860,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
wiphy = &rdev->wiphy;
- if (!rdev->ops->scan) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (!rdev->ops->scan)
+ return -EOPNOTSUPP;
- if (rdev->scan_req) {
- err = -EBUSY;
- goto out;
- }
+ if (rdev->scan_req)
+ return -EBUSY;
if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
n_channels = validate_scan_freqs(
info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
- if (!n_channels) {
- err = -EINVAL;
- goto out;
- }
+ if (!n_channels)
+ return -EINVAL;
} else {
n_channels = 0;
@@ -3126,29 +2885,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp)
n_ssids++;
- if (n_ssids > wiphy->max_scan_ssids) {
- err = -EINVAL;
- goto out;
- }
+ if (n_ssids > wiphy->max_scan_ssids)
+ return -EINVAL;
if (info->attrs[NL80211_ATTR_IE])
ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
else
ie_len = 0;
- if (ie_len > wiphy->max_scan_ie_len) {
- err = -EINVAL;
- goto out;
- }
+ if (ie_len > wiphy->max_scan_ie_len)
+ return -EINVAL;
request = kzalloc(sizeof(*request)
+ sizeof(*ssid) * n_ssids
+ sizeof(channel) * n_channels
+ ie_len, GFP_KERNEL);
- if (!request) {
- err = -ENOMEM;
- goto out;
- }
+ if (!request)
+ return -ENOMEM;
if (n_ssids)
request->ssids = (void *)&request->channels[n_channels];
@@ -3236,18 +2989,11 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
if (!err) {
nl80211_send_scan_start(rdev, dev);
dev_hold(dev);
- }
-
+ } else {
out_free:
- if (err) {
rdev->scan_req = NULL;
kfree(request);
}
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
return err;
}
@@ -3306,6 +3052,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
}
switch (wdev->iftype) {
+ case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_STATION:
if (intbss == wdev->current_bss)
NLA_PUT_U32(msg, NL80211_BSS_STATUS,
@@ -3343,25 +3090,12 @@ static int nl80211_dump_scan(struct sk_buff *skb,
struct net_device *dev;
struct cfg80211_internal_bss *scan;
struct wireless_dev *wdev;
- int ifidx = cb->args[0];
int start = cb->args[1], idx = 0;
int err;
- if (!ifidx)
- ifidx = nl80211_get_ifidx(cb);
- if (ifidx < 0)
- return ifidx;
- cb->args[0] = ifidx;
-
- dev = dev_get_by_index(sock_net(skb->sk), ifidx);
- if (!dev)
- return -ENODEV;
-
- rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
- if (IS_ERR(rdev)) {
- err = PTR_ERR(rdev);
- goto out_put_netdev;
- }
+ err = nl80211_prepare_netdev_dump(skb, cb, &rdev, &dev);
+ if (err)
+ return err;
wdev = dev->ieee80211_ptr;
@@ -3377,21 +3111,17 @@ static int nl80211_dump_scan(struct sk_buff *skb,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
rdev, wdev, scan) < 0) {
idx--;
- goto out;
+ break;
}
}
- out:
spin_unlock_bh(&rdev->bss_lock);
wdev_unlock(wdev);
cb->args[1] = idx;
- err = skb->len;
- cfg80211_unlock_rdev(rdev);
- out_put_netdev:
- dev_put(dev);
+ nl80211_finish_netdev_dump(rdev);
- return err;
+ return skb->len;
}
static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
@@ -3421,6 +3151,23 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
if (survey->filled & SURVEY_INFO_NOISE_DBM)
NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE,
survey->noise);
+ if (survey->filled & SURVEY_INFO_IN_USE)
+ NLA_PUT_FLAG(msg, NL80211_SURVEY_INFO_IN_USE);
+ if (survey->filled & SURVEY_INFO_CHANNEL_TIME)
+ NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME,
+ survey->channel_time);
+ if (survey->filled & SURVEY_INFO_CHANNEL_TIME_BUSY)
+ NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_BUSY,
+ survey->channel_time_busy);
+ if (survey->filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY)
+ NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_EXT_BUSY,
+ survey->channel_time_ext_busy);
+ if (survey->filled & SURVEY_INFO_CHANNEL_TIME_RX)
+ NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_RX,
+ survey->channel_time_rx);
+ if (survey->filled & SURVEY_INFO_CHANNEL_TIME_TX)
+ NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_TX,
+ survey->channel_time_tx);
nla_nest_end(msg, infoattr);
@@ -3437,29 +3184,12 @@ static int nl80211_dump_survey(struct sk_buff *skb,
struct survey_info survey;
struct cfg80211_registered_device *dev;
struct net_device *netdev;
- int ifidx = cb->args[0];
int survey_idx = cb->args[1];
int res;
- if (!ifidx)
- ifidx = nl80211_get_ifidx(cb);
- if (ifidx < 0)
- return ifidx;
- cb->args[0] = ifidx;
-
- rtnl_lock();
-
- netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
- if (!netdev) {
- res = -ENODEV;
- goto out_rtnl;
- }
-
- dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
- if (IS_ERR(dev)) {
- res = PTR_ERR(dev);
- goto out_rtnl;
- }
+ res = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+ if (res)
+ return res;
if (!dev->ops->dump_survey) {
res = -EOPNOTSUPP;
@@ -3487,10 +3217,7 @@ static int nl80211_dump_survey(struct sk_buff *skb,
cb->args[1] = survey_idx;
res = skb->len;
out_err:
- cfg80211_unlock_rdev(dev);
- out_rtnl:
- rtnl_unlock();
-
+ nl80211_finish_netdev_dump(dev);
return res;
}
@@ -3523,8 +3250,8 @@ static bool nl80211_valid_cipher_suite(u32 cipher)
static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct ieee80211_channel *chan;
const u8 *bssid, *ssid, *ie = NULL;
int err, ssid_len, ie_len = 0;
@@ -3552,6 +3279,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
return err;
if (key.idx >= 0) {
+ if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP)
+ return -EINVAL;
if (!key.p.key || !key.p.key_len)
return -EINVAL;
if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 ||
@@ -3566,34 +3295,31 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
key.p.key = NULL;
}
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->auth) {
- err = -EOPNOTSUPP;
- goto out;
+ if (key.idx >= 0) {
+ int i;
+ bool ok = false;
+ for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) {
+ if (key.p.cipher == rdev->wiphy.cipher_suites[i]) {
+ ok = true;
+ break;
+ }
+ }
+ if (!ok)
+ return -EINVAL;
}
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->auth)
+ return -EOPNOTSUPP;
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
chan = ieee80211_get_channel(&rdev->wiphy,
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
- if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) {
- err = -EINVAL;
- goto out;
- }
+ if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
+ return -EINVAL;
ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
@@ -3604,27 +3330,19 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
}
auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
- if (!nl80211_valid_auth_type(auth_type)) {
- err = -EINVAL;
- goto out;
- }
+ if (!nl80211_valid_auth_type(auth_type))
+ return -EINVAL;
local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
- err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
- ssid, ssid_len, ie, ie_len,
- key.p.key, key.p.key_len, key.idx,
- local_state_change);
-
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
- rtnl_unlock();
- return err;
+ return cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
+ ssid, ssid_len, ie, ie_len,
+ key.p.key, key.p.key_len, key.idx,
+ local_state_change);
}
-static int nl80211_crypto_settings(struct genl_info *info,
+static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
+ struct genl_info *info,
struct cfg80211_crypto_settings *settings,
int cipher_limit)
{
@@ -3632,6 +3350,19 @@ static int nl80211_crypto_settings(struct genl_info *info,
settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT];
+ if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
+ u16 proto;
+ proto = nla_get_u16(
+ info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
+ settings->control_port_ethertype = cpu_to_be16(proto);
+ if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
+ proto != ETH_P_PAE)
+ return -EINVAL;
+ if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT])
+ settings->control_port_no_encrypt = true;
+ } else
+ settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE);
+
if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
void *data;
int len, i;
@@ -3691,8 +3422,8 @@ static int nl80211_crypto_settings(struct genl_info *info,
static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct cfg80211_crypto_settings crypto;
struct ieee80211_channel *chan;
const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL;
@@ -3707,35 +3438,19 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
!info->attrs[NL80211_ATTR_WIPHY_FREQ])
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->assoc) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->assoc)
+ return -EOPNOTSUPP;
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
chan = ieee80211_get_channel(&rdev->wiphy,
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
- if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) {
- err = -EINVAL;
- goto out;
- }
+ if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
+ return -EINVAL;
ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
@@ -3750,35 +3465,28 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
if (mfp == NL80211_MFP_REQUIRED)
use_mfp = true;
- else if (mfp != NL80211_MFP_NO) {
- err = -EINVAL;
- goto out;
- }
+ else if (mfp != NL80211_MFP_NO)
+ return -EINVAL;
}
if (info->attrs[NL80211_ATTR_PREV_BSSID])
prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);
- err = nl80211_crypto_settings(info, &crypto, 1);
+ err = nl80211_crypto_settings(rdev, info, &crypto, 1);
if (!err)
err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid,
ssid, ssid_len, ie, ie_len, use_mfp,
&crypto);
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
- rtnl_unlock();
return err;
}
static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
const u8 *ie = NULL, *bssid;
- int err, ie_len = 0;
+ int ie_len = 0;
u16 reason_code;
bool local_state_change;
@@ -3791,34 +3499,19 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_REASON_CODE])
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->deauth) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->deauth)
+ return -EOPNOTSUPP;
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
if (reason_code == 0) {
/* Reason Code 0 is reserved */
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (info->attrs[NL80211_ATTR_IE]) {
@@ -3828,23 +3521,16 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
- err = cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
- local_state_change);
-
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
- rtnl_unlock();
- return err;
+ return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
+ local_state_change);
}
static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
const u8 *ie = NULL, *bssid;
- int err, ie_len = 0;
+ int ie_len = 0;
u16 reason_code;
bool local_state_change;
@@ -3857,34 +3543,19 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_REASON_CODE])
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->disassoc) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->disassoc)
+ return -EOPNOTSUPP;
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
if (reason_code == 0) {
/* Reason Code 0 is reserved */
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (info->attrs[NL80211_ATTR_IE]) {
@@ -3894,21 +3565,14 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
- err = cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
- local_state_change);
-
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
- rtnl_unlock();
- return err;
+ return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
+ local_state_change);
}
static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct cfg80211_ibss_params ibss;
struct wiphy *wiphy;
struct cfg80211_cached_keys *connkeys = NULL;
@@ -3933,26 +3597,11 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->join_ibss) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->join_ibss)
+ return -EOPNOTSUPP;
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
+ return -EOPNOTSUPP;
wiphy = &rdev->wiphy;
@@ -3970,24 +3619,12 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
if (!ibss.channel ||
ibss.channel->flags & IEEE80211_CHAN_NO_IBSS ||
- ibss.channel->flags & IEEE80211_CHAN_DISABLED) {
- err = -EINVAL;
- goto out;
- }
+ ibss.channel->flags & IEEE80211_CHAN_DISABLED)
+ return -EINVAL;
ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED];
ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
- if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
- connkeys = nl80211_parse_connkeys(rdev,
- info->attrs[NL80211_ATTR_KEYS]);
- if (IS_ERR(connkeys)) {
- err = PTR_ERR(connkeys);
- connkeys = NULL;
- goto out;
- }
- }
-
if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
u8 *rates =
nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
@@ -3997,10 +3634,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
wiphy->bands[ibss.channel->band];
int i, j;
- if (n_rates == 0) {
- err = -EINVAL;
- goto out;
- }
+ if (n_rates == 0)
+ return -EINVAL;
for (i = 0; i < n_rates; i++) {
int rate = (rates[i] & 0x7f) * 5;
@@ -4013,77 +3648,36 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
break;
}
}
- if (!found) {
- err = -EINVAL;
- goto out;
- }
- }
- } else {
- /*
- * If no rates were explicitly configured,
- * use the mandatory rate set for 11b or
- * 11a for maximum compatibility.
- */
- struct ieee80211_supported_band *sband =
- wiphy->bands[ibss.channel->band];
- int j;
- u32 flag = ibss.channel->band == IEEE80211_BAND_5GHZ ?
- IEEE80211_RATE_MANDATORY_A :
- IEEE80211_RATE_MANDATORY_B;
-
- for (j = 0; j < sband->n_bitrates; j++) {
- if (sband->bitrates[j].flags & flag)
- ibss.basic_rates |= BIT(j);
+ if (!found)
+ return -EINVAL;
}
}
- err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
+ if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
+ connkeys = nl80211_parse_connkeys(rdev,
+ info->attrs[NL80211_ATTR_KEYS]);
+ if (IS_ERR(connkeys))
+ return PTR_ERR(connkeys);
+ }
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
+ err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
if (err)
kfree(connkeys);
- rtnl_unlock();
return err;
}
static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
- int err;
-
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->leave_ibss) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (!rdev->ops->leave_ibss)
+ return -EOPNOTSUPP;
- err = cfg80211_leave_ibss(rdev, dev, false);
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
+ return -EOPNOTSUPP;
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
- rtnl_unlock();
- return err;
+ return cfg80211_leave_ibss(rdev, dev, false);
}
#ifdef CONFIG_NL80211_TESTMODE
@@ -4093,20 +3687,12 @@ static struct genl_multicast_group nl80211_testmode_mcgrp = {
static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int err;
if (!info->attrs[NL80211_ATTR_TESTDATA])
return -EINVAL;
- rtnl_lock();
-
- rdev = cfg80211_get_dev_from_info(info);
- if (IS_ERR(rdev)) {
- err = PTR_ERR(rdev);
- goto unlock_rtnl;
- }
-
err = -EOPNOTSUPP;
if (rdev->ops->testmode_cmd) {
rdev->testmode_info = info;
@@ -4116,10 +3702,6 @@ static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
rdev->testmode_info = NULL;
}
- cfg80211_unlock_rdev(rdev);
-
- unlock_rtnl:
- rtnl_unlock();
return err;
}
@@ -4210,8 +3792,8 @@ EXPORT_SYMBOL(cfg80211_testmode_event);
static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct cfg80211_connect_params connect;
struct wiphy *wiphy;
struct cfg80211_cached_keys *connkeys = NULL;
@@ -4236,25 +3818,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
connect.privacy = info->attrs[NL80211_ATTR_PRIVACY];
- err = nl80211_crypto_settings(info, &connect.crypto,
+ err = nl80211_crypto_settings(rdev, info, &connect.crypto,
NL80211_MAX_NR_CIPHER_SUITES);
if (err)
return err;
- rtnl_lock();
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
wiphy = &rdev->wiphy;
@@ -4273,39 +3844,27 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
ieee80211_get_channel(wiphy,
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
if (!connect.channel ||
- connect.channel->flags & IEEE80211_CHAN_DISABLED) {
- err = -EINVAL;
- goto out;
- }
+ connect.channel->flags & IEEE80211_CHAN_DISABLED)
+ return -EINVAL;
}
if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) {
connkeys = nl80211_parse_connkeys(rdev,
info->attrs[NL80211_ATTR_KEYS]);
- if (IS_ERR(connkeys)) {
- err = PTR_ERR(connkeys);
- connkeys = NULL;
- goto out;
- }
+ if (IS_ERR(connkeys))
+ return PTR_ERR(connkeys);
}
err = cfg80211_connect(rdev, dev, &connect, connkeys);
-
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
if (err)
kfree(connkeys);
- rtnl_unlock();
return err;
}
static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
- int err;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
u16 reason;
if (!info->attrs[NL80211_ATTR_REASON_CODE])
@@ -4316,35 +3875,16 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
if (reason == 0)
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
-
- err = cfg80211_disconnect(rdev, dev, reason, true);
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
- rtnl_unlock();
- return err;
+ return cfg80211_disconnect(rdev, dev, reason, true);
}
static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net *net;
int err;
u32 pid;
@@ -4354,43 +3894,26 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
pid = nla_get_u32(info->attrs[NL80211_ATTR_PID]);
- rtnl_lock();
-
- rdev = cfg80211_get_dev_from_info(info);
- if (IS_ERR(rdev)) {
- err = PTR_ERR(rdev);
- goto out_rtnl;
- }
-
net = get_net_ns_by_pid(pid);
- if (IS_ERR(net)) {
- err = PTR_ERR(net);
- goto out;
- }
+ if (IS_ERR(net))
+ return PTR_ERR(net);
err = 0;
/* check if anything to do */
- if (net_eq(wiphy_net(&rdev->wiphy), net))
- goto out_put_net;
+ if (!net_eq(wiphy_net(&rdev->wiphy), net))
+ err = cfg80211_switch_netns(rdev, net);
- err = cfg80211_switch_netns(rdev, net);
- out_put_net:
put_net(net);
- out:
- cfg80211_unlock_rdev(rdev);
- out_rtnl:
- rtnl_unlock();
return err;
}
static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev,
struct cfg80211_pmksa *pmksa) = NULL;
- int err;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
struct cfg80211_pmksa pmksa;
memset(&pmksa, 0, sizeof(struct cfg80211_pmksa));
@@ -4401,19 +3924,12 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_PMKID])
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);
pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
switch (info->genlhdr->cmd) {
case NL80211_CMD_SET_PMKSA:
@@ -4427,61 +3943,32 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
break;
}
- if (!rdev_ops) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- err = rdev_ops(&rdev->wiphy, dev, &pmksa);
-
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ if (!rdev_ops)
+ return -EOPNOTSUPP;
- return err;
+ return rdev_ops(&rdev->wiphy, dev, &pmksa);
}
static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- int err;
- struct net_device *dev;
-
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto out_rtnl;
-
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!rdev->ops->flush_pmksa) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- err = rdev->ops->flush_pmksa(&rdev->wiphy, dev);
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- out_rtnl:
- rtnl_unlock();
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
- return err;
+ if (!rdev->ops->flush_pmksa)
+ return -EOPNOTSUPP;
+ return rdev->ops->flush_pmksa(&rdev->wiphy, dev);
}
static int nl80211_remain_on_channel(struct sk_buff *skb,
struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct ieee80211_channel *chan;
struct sk_buff *msg;
void *hdr;
@@ -4503,21 +3990,8 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
if (!duration || !msecs_to_jiffies(duration) || duration > 5000)
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->remain_on_channel) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (!rdev->ops->remain_on_channel)
+ return -EOPNOTSUPP;
if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
channel_type = nla_get_u32(
@@ -4525,24 +3999,18 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
if (channel_type != NL80211_CHAN_NO_HT &&
channel_type != NL80211_CHAN_HT20 &&
channel_type != NL80211_CHAN_HT40PLUS &&
- channel_type != NL80211_CHAN_HT40MINUS) {
- err = -EINVAL;
- goto out;
- }
+ channel_type != NL80211_CHAN_HT40MINUS)
+ return -EINVAL;
}
freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
chan = rdev_freq_to_chan(rdev, freq, channel_type);
- if (chan == NULL) {
- err = -EINVAL;
- goto out;
- }
+ if (chan == NULL)
+ return -EINVAL;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg) {
- err = -ENOMEM;
- goto out;
- }
+ if (!msg)
+ return -ENOMEM;
hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
NL80211_CMD_REMAIN_ON_CHANNEL);
@@ -4561,58 +4029,32 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
genlmsg_end(msg, hdr);
- err = genlmsg_reply(msg, info);
- goto out;
+
+ return genlmsg_reply(msg, info);
nla_put_failure:
err = -ENOBUFS;
free_msg:
nlmsg_free(msg);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
return err;
}
static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
u64 cookie;
- int err;
if (!info->attrs[NL80211_ATTR_COOKIE])
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->cancel_remain_on_channel) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ if (!rdev->ops->cancel_remain_on_channel)
+ return -EOPNOTSUPP;
cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
- err = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie);
-
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
- return err;
+ return rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie);
}
static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
@@ -4648,26 +4090,18 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
struct genl_info *info)
{
struct nlattr *tb[NL80211_TXRATE_MAX + 1];
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct cfg80211_bitrate_mask mask;
- int err, rem, i;
- struct net_device *dev;
+ int rem, i;
+ struct net_device *dev = info->user_ptr[1];
struct nlattr *tx_rates;
struct ieee80211_supported_band *sband;
if (info->attrs[NL80211_ATTR_TX_RATES] == NULL)
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->set_bitrate_mask) {
- err = -EOPNOTSUPP;
- goto unlock;
- }
+ if (!rdev->ops->set_bitrate_mask)
+ return -EOPNOTSUPP;
memset(&mask, 0, sizeof(mask));
/* Default to all rates enabled */
@@ -4684,15 +4118,11 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem)
{
enum ieee80211_band band = nla_type(tx_rates);
- if (band < 0 || band >= IEEE80211_NUM_BANDS) {
- err = -EINVAL;
- goto unlock;
- }
+ if (band < 0 || band >= IEEE80211_NUM_BANDS)
+ return -EINVAL;
sband = rdev->wiphy.bands[band];
- if (sband == NULL) {
- err = -EINVAL;
- goto unlock;
- }
+ if (sband == NULL)
+ return -EINVAL;
nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
nla_len(tx_rates), nl80211_txattr_policy);
if (tb[NL80211_TXRATE_LEGACY]) {
@@ -4700,68 +4130,48 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
sband,
nla_data(tb[NL80211_TXRATE_LEGACY]),
nla_len(tb[NL80211_TXRATE_LEGACY]));
- if (mask.control[band].legacy == 0) {
- err = -EINVAL;
- goto unlock;
- }
+ if (mask.control[band].legacy == 0)
+ return -EINVAL;
}
}
- err = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask);
-
- unlock:
- dev_put(dev);
- cfg80211_unlock_rdev(rdev);
- unlock_rtnl:
- rtnl_unlock();
- return err;
+ return rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask);
}
-static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info)
+static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
- int err;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION;
if (!info->attrs[NL80211_ATTR_FRAME_MATCH])
return -EINVAL;
- if (nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]) < 1)
- return -EINVAL;
-
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
+ if (info->attrs[NL80211_ATTR_FRAME_TYPE])
+ frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]);
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
/* not much point in registering if we can't reply */
- if (!rdev->ops->action) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->mgmt_tx)
+ return -EOPNOTSUPP;
- err = cfg80211_mlme_register_action(dev->ieee80211_ptr, info->snd_pid,
+ return cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid,
+ frame_type,
nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]),
nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]));
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- unlock_rtnl:
- rtnl_unlock();
- return err;
}
-static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
+static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
- struct net_device *dev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
struct ieee80211_channel *chan;
enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
bool channel_type_valid = false;
@@ -4775,27 +4185,16 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
!info->attrs[NL80211_ATTR_WIPHY_FREQ])
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
- if (!rdev->ops->action) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->mgmt_tx)
+ return -EOPNOTSUPP;
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- if (!netif_running(dev)) {
- err = -ENETDOWN;
- goto out;
- }
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EOPNOTSUPP;
if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
channel_type = nla_get_u32(
@@ -4803,147 +4202,104 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
if (channel_type != NL80211_CHAN_NO_HT &&
channel_type != NL80211_CHAN_HT20 &&
channel_type != NL80211_CHAN_HT40PLUS &&
- channel_type != NL80211_CHAN_HT40MINUS) {
- err = -EINVAL;
- goto out;
- }
+ channel_type != NL80211_CHAN_HT40MINUS)
+ return -EINVAL;
channel_type_valid = true;
}
freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
chan = rdev_freq_to_chan(rdev, freq, channel_type);
- if (chan == NULL) {
- err = -EINVAL;
- goto out;
- }
+ if (chan == NULL)
+ return -EINVAL;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg) {
- err = -ENOMEM;
- goto out;
- }
+ if (!msg)
+ return -ENOMEM;
hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
- NL80211_CMD_ACTION);
+ NL80211_CMD_FRAME);
if (IS_ERR(hdr)) {
err = PTR_ERR(hdr);
goto free_msg;
}
- err = cfg80211_mlme_action(rdev, dev, chan, channel_type,
- channel_type_valid,
- nla_data(info->attrs[NL80211_ATTR_FRAME]),
- nla_len(info->attrs[NL80211_ATTR_FRAME]),
- &cookie);
+ err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, channel_type,
+ channel_type_valid,
+ nla_data(info->attrs[NL80211_ATTR_FRAME]),
+ nla_len(info->attrs[NL80211_ATTR_FRAME]),
+ &cookie);
if (err)
goto free_msg;
NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
genlmsg_end(msg, hdr);
- err = genlmsg_reply(msg, info);
- goto out;
+ return genlmsg_reply(msg, info);
nla_put_failure:
err = -ENOBUFS;
free_msg:
nlmsg_free(msg);
- out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-unlock_rtnl:
- rtnl_unlock();
return err;
}
static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct wireless_dev *wdev;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
u8 ps_state;
bool state;
int err;
- if (!info->attrs[NL80211_ATTR_PS_STATE]) {
- err = -EINVAL;
- goto out;
- }
+ if (!info->attrs[NL80211_ATTR_PS_STATE])
+ return -EINVAL;
ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
- if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED) {
- err = -EINVAL;
- goto out;
- }
-
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rdev;
+ if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED)
+ return -EINVAL;
wdev = dev->ieee80211_ptr;
- if (!rdev->ops->set_power_mgmt) {
- err = -EOPNOTSUPP;
- goto unlock_rdev;
- }
+ if (!rdev->ops->set_power_mgmt)
+ return -EOPNOTSUPP;
state = (ps_state == NL80211_PS_ENABLED) ? true : false;
if (state == wdev->ps)
- goto unlock_rdev;
-
- wdev->ps = state;
-
- if (rdev->ops->set_power_mgmt(wdev->wiphy, dev, wdev->ps,
- wdev->ps_timeout))
- /* assume this means it's off */
- wdev->ps = false;
-
-unlock_rdev:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- rtnl_unlock();
+ return 0;
-out:
+ err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, state,
+ wdev->ps_timeout);
+ if (!err)
+ wdev->ps = state;
return err;
}
static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
enum nl80211_ps_state ps_state;
struct wireless_dev *wdev;
- struct net_device *dev;
+ struct net_device *dev = info->user_ptr[1];
struct sk_buff *msg;
void *hdr;
int err;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rtnl;
-
wdev = dev->ieee80211_ptr;
- if (!rdev->ops->set_power_mgmt) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->set_power_mgmt)
+ return -EOPNOTSUPP;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg) {
- err = -ENOMEM;
- goto out;
- }
+ if (!msg)
+ return -ENOMEM;
hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
NL80211_CMD_GET_POWER_SAVE);
if (!hdr) {
- err = -ENOMEM;
+ err = -ENOBUFS;
goto free_msg;
}
@@ -4955,22 +4311,12 @@ static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
NLA_PUT_U32(msg, NL80211_ATTR_PS_STATE, ps_state);
genlmsg_end(msg, hdr);
- err = genlmsg_reply(msg, info);
- goto out;
+ return genlmsg_reply(msg, info);
-nla_put_failure:
+ nla_put_failure:
err = -ENOBUFS;
-
-free_msg:
+ free_msg:
nlmsg_free(msg);
-
-out:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
-
-unlock_rtnl:
- rtnl_unlock();
-
return err;
}
@@ -4984,41 +4330,24 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] __read_mostly = {
static int nl80211_set_cqm_rssi(struct genl_info *info,
s32 threshold, u32 hysteresis)
{
- struct cfg80211_registered_device *rdev;
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct wireless_dev *wdev;
- struct net_device *dev;
- int err;
+ struct net_device *dev = info->user_ptr[1];
if (threshold > 0)
return -EINVAL;
- rtnl_lock();
-
- err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
- if (err)
- goto unlock_rdev;
-
wdev = dev->ieee80211_ptr;
- if (!rdev->ops->set_cqm_rssi_config) {
- err = -EOPNOTSUPP;
- goto unlock_rdev;
- }
-
- if (wdev->iftype != NL80211_IFTYPE_STATION) {
- err = -EOPNOTSUPP;
- goto unlock_rdev;
- }
-
- err = rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev,
- threshold, hysteresis);
+ if (!rdev->ops->set_cqm_rssi_config)
+ return -EOPNOTSUPP;
-unlock_rdev:
- cfg80211_unlock_rdev(rdev);
- dev_put(dev);
- rtnl_unlock();
+ if (wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
- return err;
+ return rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev,
+ threshold, hysteresis);
}
static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
@@ -5052,6 +4381,65 @@ out:
return err;
}
+#define NL80211_FLAG_NEED_WIPHY 0x01
+#define NL80211_FLAG_NEED_NETDEV 0x02
+#define NL80211_FLAG_NEED_RTNL 0x04
+#define NL80211_FLAG_CHECK_NETDEV_UP 0x08
+#define NL80211_FLAG_NEED_NETDEV_UP (NL80211_FLAG_NEED_NETDEV |\
+ NL80211_FLAG_CHECK_NETDEV_UP)
+
+static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev;
+ struct net_device *dev;
+ int err;
+ bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL;
+
+ if (rtnl)
+ rtnl_lock();
+
+ if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) {
+ rdev = cfg80211_get_dev_from_info(info);
+ if (IS_ERR(rdev)) {
+ if (rtnl)
+ rtnl_unlock();
+ return PTR_ERR(rdev);
+ }
+ info->user_ptr[0] = rdev;
+ } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) {
+ err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
+ if (err) {
+ if (rtnl)
+ rtnl_unlock();
+ return err;
+ }
+ if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
+ !netif_running(dev)) {
+ cfg80211_unlock_rdev(rdev);
+ dev_put(dev);
+ if (rtnl)
+ rtnl_unlock();
+ return -ENETDOWN;
+ }
+ info->user_ptr[0] = rdev;
+ info->user_ptr[1] = dev;
+ }
+
+ return 0;
+}
+
+static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ if (info->user_ptr[0])
+ cfg80211_unlock_rdev(info->user_ptr[0]);
+ if (info->user_ptr[1])
+ dev_put(info->user_ptr[1]);
+ if (ops->internal_flags & NL80211_FLAG_NEED_RTNL)
+ rtnl_unlock();
+}
+
static struct genl_ops nl80211_ops[] = {
{
.cmd = NL80211_CMD_GET_WIPHY,
@@ -5059,12 +4447,14 @@ static struct genl_ops nl80211_ops[] = {
.dumpit = nl80211_dump_wiphy,
.policy = nl80211_policy,
/* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_WIPHY,
},
{
.cmd = NL80211_CMD_SET_WIPHY,
.doit = nl80211_set_wiphy,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_INTERFACE,
@@ -5072,90 +4462,119 @@ static struct genl_ops nl80211_ops[] = {
.dumpit = nl80211_dump_interface,
.policy = nl80211_policy,
/* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_NETDEV,
},
{
.cmd = NL80211_CMD_SET_INTERFACE,
.doit = nl80211_set_interface,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_NEW_INTERFACE,
.doit = nl80211_new_interface,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DEL_INTERFACE,
.doit = nl80211_del_interface,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_KEY,
.doit = nl80211_get_key,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_KEY,
.doit = nl80211_set_key,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_NEW_KEY,
.doit = nl80211_new_key,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DEL_KEY,
.doit = nl80211_del_key,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_BEACON,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
.doit = nl80211_addset_beacon,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_NEW_BEACON,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
.doit = nl80211_addset_beacon,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DEL_BEACON,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
.doit = nl80211_del_beacon,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_STATION,
.doit = nl80211_get_station,
.dumpit = nl80211_dump_station,
.policy = nl80211_policy,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_STATION,
.doit = nl80211_set_station,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_NEW_STATION,
.doit = nl80211_new_station,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DEL_STATION,
.doit = nl80211_del_station,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_MPATH,
@@ -5163,30 +4582,40 @@ static struct genl_ops nl80211_ops[] = {
.dumpit = nl80211_dump_mpath,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_MPATH,
.doit = nl80211_set_mpath,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_NEW_MPATH,
.doit = nl80211_new_mpath,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DEL_MPATH,
.doit = nl80211_del_mpath,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_BSS,
.doit = nl80211_set_bss,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_REG,
@@ -5211,18 +4640,24 @@ static struct genl_ops nl80211_ops[] = {
.doit = nl80211_get_mesh_params,
.policy = nl80211_policy,
/* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_MESH_PARAMS,
.doit = nl80211_set_mesh_params,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_TRIGGER_SCAN,
.doit = nl80211_trigger_scan,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_SCAN,
@@ -5234,36 +4669,48 @@ static struct genl_ops nl80211_ops[] = {
.doit = nl80211_authenticate,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_ASSOCIATE,
.doit = nl80211_associate,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DEAUTHENTICATE,
.doit = nl80211_deauthenticate,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DISASSOCIATE,
.doit = nl80211_disassociate,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_JOIN_IBSS,
.doit = nl80211_join_ibss,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_LEAVE_IBSS,
.doit = nl80211_leave_ibss,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
#ifdef CONFIG_NL80211_TESTMODE
{
@@ -5271,6 +4718,8 @@ static struct genl_ops nl80211_ops[] = {
.doit = nl80211_testmode_do,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
},
#endif
{
@@ -5278,18 +4727,24 @@ static struct genl_ops nl80211_ops[] = {
.doit = nl80211_connect,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DISCONNECT,
.doit = nl80211_disconnect,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_WIPHY_NETNS,
.doit = nl80211_wiphy_netns,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WIPHY |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_SURVEY,
@@ -5301,72 +4756,104 @@ static struct genl_ops nl80211_ops[] = {
.doit = nl80211_setdel_pmksa,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_DEL_PMKSA,
.doit = nl80211_setdel_pmksa,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_FLUSH_PMKSA,
.doit = nl80211_flush_pmksa,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_REMAIN_ON_CHANNEL,
.doit = nl80211_remain_on_channel,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
.doit = nl80211_cancel_remain_on_channel,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_TX_BITRATE_MASK,
.doit = nl80211_set_tx_bitrate_mask,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
- .cmd = NL80211_CMD_REGISTER_ACTION,
- .doit = nl80211_register_action,
+ .cmd = NL80211_CMD_REGISTER_FRAME,
+ .doit = nl80211_register_mgmt,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
- .cmd = NL80211_CMD_ACTION,
- .doit = nl80211_action,
+ .cmd = NL80211_CMD_FRAME,
+ .doit = nl80211_tx_mgmt,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_POWER_SAVE,
.doit = nl80211_set_power_save,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_GET_POWER_SAVE,
.doit = nl80211_get_power_save,
.policy = nl80211_policy,
/* can be retrieved by unprivileged users */
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_CQM,
.doit = nl80211_set_cqm,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
{
.cmd = NL80211_CMD_SET_CHANNEL,
.doit = nl80211_set_channel,
.policy = nl80211_policy,
.flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL80211_CMD_SET_WDS_PEER,
+ .doit = nl80211_set_wds_peer,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
},
};
@@ -6040,9 +5527,9 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
nl80211_mlme_mcgrp.id, gfp);
}
-int nl80211_send_action(struct cfg80211_registered_device *rdev,
- struct net_device *netdev, u32 nlpid,
- int freq, const u8 *buf, size_t len, gfp_t gfp)
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u32 nlpid,
+ int freq, const u8 *buf, size_t len, gfp_t gfp)
{
struct sk_buff *msg;
void *hdr;
@@ -6052,7 +5539,7 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev,
if (!msg)
return -ENOMEM;
- hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION);
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME);
if (!hdr) {
nlmsg_free(msg);
return -ENOMEM;
@@ -6080,10 +5567,10 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev,
return -ENOBUFS;
}
-void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev,
- struct net_device *netdev, u64 cookie,
- const u8 *buf, size_t len, bool ack,
- gfp_t gfp)
+void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u64 cookie,
+ const u8 *buf, size_t len, bool ack,
+ gfp_t gfp)
{
struct sk_buff *msg;
void *hdr;
@@ -6092,7 +5579,7 @@ void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev,
if (!msg)
return;
- hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION_TX_STATUS);
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS);
if (!hdr) {
nlmsg_free(msg);
return;
@@ -6179,7 +5666,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list)
list_for_each_entry_rcu(wdev, &rdev->netdev_list, list)
- cfg80211_mlme_unregister_actions(wdev, notify->pid);
+ cfg80211_mlme_unregister_socket(wdev, notify->pid);
rcu_read_unlock();
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 2ad7fbc7d9f..30d2f939150 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -74,13 +74,13 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
struct net_device *dev, const u8 *mac_addr,
struct station_info *sinfo, gfp_t gfp);
-int nl80211_send_action(struct cfg80211_registered_device *rdev,
- struct net_device *netdev, u32 nlpid, int freq,
- const u8 *buf, size_t len, gfp_t gfp);
-void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev,
- struct net_device *netdev, u64 cookie,
- const u8 *buf, size_t len, bool ack,
- gfp_t gfp);
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u32 nlpid, int freq,
+ const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u64 cookie,
+ const u8 *buf, size_t len, bool ack,
+ gfp_t gfp);
void
nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
index 1332c445d1c..dbe35e138e9 100644
--- a/net/wireless/radiotap.c
+++ b/net/wireless/radiotap.c
@@ -14,6 +14,7 @@
* See COPYING for more details.
*/
+#include <linux/kernel.h>
#include <net/cfg80211.h>
#include <net/ieee80211_radiotap.h>
#include <asm/unaligned.h>
@@ -45,7 +46,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = {
};
static const struct ieee80211_radiotap_namespace radiotap_ns = {
- .n_bits = sizeof(rtap_namespace_sizes) / sizeof(rtap_namespace_sizes[0]),
+ .n_bits = ARRAY_SIZE(rtap_namespace_sizes),
.align_size = rtap_namespace_sizes,
};
@@ -200,7 +201,7 @@ int ieee80211_radiotap_iterator_next(
{
while (1) {
int hit = 0;
- int pad, align, size, subns, vnslen;
+ int pad, align, size, subns;
uint32_t oui;
/* if no more EXT bits, that's it */
@@ -260,6 +261,27 @@ int ieee80211_radiotap_iterator_next(
if (pad)
iterator->_arg += align - pad;
+ if (iterator->_arg_index % 32 == IEEE80211_RADIOTAP_VENDOR_NAMESPACE) {
+ int vnslen;
+
+ if ((unsigned long)iterator->_arg + size -
+ (unsigned long)iterator->_rtheader >
+ (unsigned long)iterator->_max_length)
+ return -EINVAL;
+
+ oui = (*iterator->_arg << 16) |
+ (*(iterator->_arg + 1) << 8) |
+ *(iterator->_arg + 2);
+ subns = *(iterator->_arg + 3);
+
+ find_ns(iterator, oui, subns);
+
+ vnslen = get_unaligned_le16(iterator->_arg + 4);
+ iterator->_next_ns_data = iterator->_arg + size + vnslen;
+ if (!iterator->current_namespace)
+ size += vnslen;
+ }
+
/*
* this is what we will return to user, but we need to
* move on first so next call has something fresh to test
@@ -286,40 +308,25 @@ int ieee80211_radiotap_iterator_next(
/* these special ones are valid in each bitmap word */
switch (iterator->_arg_index % 32) {
case IEEE80211_RADIOTAP_VENDOR_NAMESPACE:
- iterator->_bitmap_shifter >>= 1;
- iterator->_arg_index++;
-
iterator->_reset_on_ext = 1;
- vnslen = get_unaligned_le16(iterator->this_arg + 4);
- iterator->_next_ns_data = iterator->_arg + vnslen;
- oui = (*iterator->this_arg << 16) |
- (*(iterator->this_arg + 1) << 8) |
- *(iterator->this_arg + 2);
- subns = *(iterator->this_arg + 3);
-
- find_ns(iterator, oui, subns);
-
iterator->is_radiotap_ns = 0;
- /* allow parsers to show this information */
+ /*
+ * If parser didn't register this vendor
+ * namespace with us, allow it to show it
+ * as 'raw. Do do that, set argument index
+ * to vendor namespace.
+ */
iterator->this_arg_index =
IEEE80211_RADIOTAP_VENDOR_NAMESPACE;
- iterator->this_arg_size += vnslen;
- if ((unsigned long)iterator->this_arg +
- iterator->this_arg_size -
- (unsigned long)iterator->_rtheader >
- (unsigned long)(unsigned long)iterator->_max_length)
- return -EINVAL;
- hit = 1;
- break;
+ if (!iterator->current_namespace)
+ hit = 1;
+ goto next_entry;
case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE:
- iterator->_bitmap_shifter >>= 1;
- iterator->_arg_index++;
-
iterator->_reset_on_ext = 1;
iterator->current_namespace = &radiotap_ns;
iterator->is_radiotap_ns = 1;
- break;
+ goto next_entry;
case IEEE80211_RADIOTAP_EXT:
/*
* bit 31 was set, there is more
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index f180db0de66..d14bbf960c1 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/random.h>
+#include <linux/ctype.h>
#include <linux/nl80211.h>
#include <linux/platform_device.h>
#include <net/cfg80211.h>
@@ -73,7 +74,11 @@ const struct ieee80211_regdomain *cfg80211_regdomain;
* - last_request
*/
static DEFINE_MUTEX(reg_mutex);
-#define assert_reg_lock() WARN_ON(!mutex_is_locked(&reg_mutex))
+
+static inline void assert_reg_lock(void)
+{
+ lockdep_assert_held(&reg_mutex);
+}
/* Used to queue up regulatory hints */
static LIST_HEAD(reg_requests_list);
@@ -181,14 +186,6 @@ static bool is_alpha2_set(const char *alpha2)
return false;
}
-static bool is_alpha_upper(char letter)
-{
- /* ASCII A - Z */
- if (letter >= 65 && letter <= 90)
- return true;
- return false;
-}
-
static bool is_unknown_alpha2(const char *alpha2)
{
if (!alpha2)
@@ -220,7 +217,7 @@ static bool is_an_alpha2(const char *alpha2)
{
if (!alpha2)
return false;
- if (is_alpha_upper(alpha2[0]) && is_alpha_upper(alpha2[1]))
+ if (isalpha(alpha2[0]) && isalpha(alpha2[1]))
return true;
return false;
}
@@ -1399,6 +1396,11 @@ static DECLARE_WORK(reg_work, reg_todo);
static void queue_regulatory_request(struct regulatory_request *request)
{
+ if (isalpha(request->alpha2[0]))
+ request->alpha2[0] = toupper(request->alpha2[0]);
+ if (isalpha(request->alpha2[1]))
+ request->alpha2[1] = toupper(request->alpha2[1]);
+
spin_lock(&reg_requests_lock);
list_add_tail(&request->list, &reg_requests_list);
spin_unlock(&reg_requests_lock);
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 5ca8c718014..503ebb86ba1 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -650,14 +650,14 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
bss = container_of(pub, struct cfg80211_internal_bss, pub);
spin_lock_bh(&dev->bss_lock);
+ if (!list_empty(&bss->list)) {
+ list_del_init(&bss->list);
+ dev->bss_generation++;
+ rb_erase(&bss->rbn, &dev->bss_tree);
- list_del(&bss->list);
- dev->bss_generation++;
- rb_erase(&bss->rbn, &dev->bss_tree);
-
+ kref_put(&bss->ref, bss_release);
+ }
spin_unlock_bh(&dev->bss_lock);
-
- kref_put(&bss->ref, bss_release);
}
EXPORT_SYMBOL(cfg80211_unlink_bss);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index a8c2d6b877a..e17b0bee6bd 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -411,7 +411,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
ASSERT_WDEV_LOCK(wdev);
- if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
return;
if (wdev->sme_state != CFG80211_SME_CONNECTING)
@@ -548,7 +549,8 @@ void __cfg80211_roamed(struct wireless_dev *wdev, const u8 *bssid,
ASSERT_WDEV_LOCK(wdev);
- if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
return;
if (wdev->sme_state != CFG80211_SME_CONNECTED)
@@ -644,7 +646,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
ASSERT_WDEV_LOCK(wdev);
- if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
return;
if (wdev->sme_state != CFG80211_SME_CONNECTED)
@@ -695,7 +698,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
*/
if (rdev->ops->del_key)
for (i = 0; i < 6; i++)
- rdev->ops->del_key(wdev->wiphy, dev, i, NULL);
+ rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
#ifdef CONFIG_CFG80211_WEXT
memset(&wrqu, 0, sizeof(wrqu));
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 9f2cef3e0ca..4294fa22bb2 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -35,6 +35,14 @@ SHOW_FMT(index, "%d", wiphy_idx);
SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
+static ssize_t name_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf) {
+ struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
+ return sprintf(buf, "%s\n", dev_name(&wiphy->dev));
+}
+
+
static ssize_t addresses_show(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -57,6 +65,7 @@ static struct device_attribute ieee80211_dev_attrs[] = {
__ATTR_RO(macaddress),
__ATTR_RO(address_mask),
__ATTR_RO(addresses),
+ __ATTR_RO(name),
{}
};
@@ -110,6 +119,13 @@ static int wiphy_resume(struct device *dev)
return ret;
}
+static const void *wiphy_namespace(struct device *d)
+{
+ struct wiphy *wiphy = container_of(d, struct wiphy, dev);
+
+ return wiphy_net(wiphy);
+}
+
struct class ieee80211_class = {
.name = "ieee80211",
.owner = THIS_MODULE,
@@ -120,6 +136,8 @@ struct class ieee80211_class = {
#endif
.suspend = wiphy_suspend,
.resume = wiphy_resume,
+ .ns_type = &net_ns_type_operations,
+ .namespace = wiphy_namespace,
};
int wiphy_sysfs_init(void)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 0c8a1e8b769..76120aeda57 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -144,19 +144,25 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
struct key_params *params, int key_idx,
- const u8 *mac_addr)
+ bool pairwise, const u8 *mac_addr)
{
int i;
if (key_idx > 5)
return -EINVAL;
+ if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ return -EINVAL;
+
+ if (pairwise && !mac_addr)
+ return -EINVAL;
+
/*
* Disallow pairwise keys with non-zero index unless it's WEP
* (because current deployments use pairwise WEP keys with
* non-zero indizes but 802.11i clearly specifies to use zero)
*/
- if (mac_addr && key_idx &&
+ if (pairwise && key_idx &&
params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
params->cipher != WLAN_CIPHER_SUITE_WEP104)
return -EINVAL;
@@ -183,7 +189,14 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
return -EINVAL;
break;
default:
- return -EINVAL;
+ /*
+ * We don't know anything about this algorithm,
+ * allow using it -- but the driver must check
+ * all parameters! We still check below whether
+ * or not the driver supports this algorithm,
+ * of course.
+ */
+ break;
}
if (params->seq) {
@@ -221,7 +234,7 @@ const unsigned char bridge_tunnel_header[] __aligned(2) =
{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
EXPORT_SYMBOL(bridge_tunnel_header);
-unsigned int ieee80211_hdrlen(__le16 fc)
+unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
{
unsigned int hdrlen = 24;
@@ -319,7 +332,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
case cpu_to_le16(IEEE80211_FCTL_TODS):
if (unlikely(iftype != NL80211_IFTYPE_AP &&
- iftype != NL80211_IFTYPE_AP_VLAN))
+ iftype != NL80211_IFTYPE_AP_VLAN &&
+ iftype != NL80211_IFTYPE_P2P_GO))
return -1;
break;
case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
@@ -347,7 +361,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
break;
case cpu_to_le16(IEEE80211_FCTL_FROMDS):
if ((iftype != NL80211_IFTYPE_STATION &&
- iftype != NL80211_IFTYPE_MESH_POINT) ||
+ iftype != NL80211_IFTYPE_P2P_CLIENT &&
+ iftype != NL80211_IFTYPE_MESH_POINT) ||
(is_multicast_ether_addr(dst) &&
!compare_ether_addr(src, addr)))
return -1;
@@ -424,6 +439,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
switch (iftype) {
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
/* DA BSSID SA */
memcpy(hdr.addr1, skb->data, ETH_ALEN);
@@ -432,6 +448,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
hdrlen = 24;
break;
case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
/* BSSID SA DA */
memcpy(hdr.addr1, bssid, ETH_ALEN);
@@ -666,7 +683,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
for (i = 0; i < 6; i++) {
if (!wdev->connect_keys->params[i].cipher)
continue;
- if (rdev->ops->add_key(wdev->wiphy, dev, i, NULL,
+ if (rdev->ops->add_key(wdev->wiphy, dev, i, false, NULL,
&wdev->connect_keys->params[i])) {
printk(KERN_ERR "%s: failed to set key %d\n",
dev->name, i);
@@ -771,7 +788,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
/* if it's part of a bridge, reject changing type to station/ibss */
if ((dev->priv_flags & IFF_BRIDGE_PORT) &&
- (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION))
+ (ntype == NL80211_IFTYPE_ADHOC ||
+ ntype == NL80211_IFTYPE_STATION ||
+ ntype == NL80211_IFTYPE_P2P_CLIENT))
return -EBUSY;
if (ntype != otype) {
@@ -782,6 +801,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
cfg80211_leave_ibss(rdev, dev, false);
break;
case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
cfg80211_disconnect(rdev, dev,
WLAN_REASON_DEAUTH_LEAVING, true);
break;
@@ -810,9 +830,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
if (dev->ieee80211_ptr->use_4addr)
break;
/* fall through */
+ case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_ADHOC:
dev->priv_flags |= IFF_DONT_BRIDGE;
break;
+ case NL80211_IFTYPE_P2P_GO:
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_WDS:
@@ -823,7 +845,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
/* monitor can't bridge anyway */
break;
case NL80211_IFTYPE_UNSPECIFIED:
- case __NL80211_IFTYPE_AFTER_LAST:
+ case NUM_NL80211_IFTYPES:
/* not happening */
break;
}
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index bb5e0a5ecfa..12222ee6ebf 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -432,14 +432,17 @@ int cfg80211_wext_giwretry(struct net_device *dev,
EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry);
static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
- struct net_device *dev, const u8 *addr,
- bool remove, bool tx_key, int idx,
- struct key_params *params)
+ struct net_device *dev, bool pairwise,
+ const u8 *addr, bool remove, bool tx_key,
+ int idx, struct key_params *params)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
int err, i;
bool rejoin = false;
+ if (pairwise && !addr)
+ return -EINVAL;
+
if (!wdev->wext.keys) {
wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
GFP_KERNEL);
@@ -478,7 +481,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
__cfg80211_leave_ibss(rdev, wdev->netdev, true);
rejoin = true;
}
- err = rdev->ops->del_key(&rdev->wiphy, dev, idx, addr);
+
+ if (!pairwise && addr &&
+ !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+ err = -ENOENT;
+ else
+ err = rdev->ops->del_key(&rdev->wiphy, dev, idx,
+ pairwise, addr);
}
wdev->wext.connect.privacy = false;
/*
@@ -507,12 +516,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
if (addr)
tx_key = false;
- if (cfg80211_validate_key_settings(rdev, params, idx, addr))
+ if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr))
return -EINVAL;
err = 0;
if (wdev->current_bss)
- err = rdev->ops->add_key(&rdev->wiphy, dev, idx, addr, params);
+ err = rdev->ops->add_key(&rdev->wiphy, dev, idx,
+ pairwise, addr, params);
if (err)
return err;
@@ -563,17 +573,17 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
}
static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
- struct net_device *dev, const u8 *addr,
- bool remove, bool tx_key, int idx,
- struct key_params *params)
+ struct net_device *dev, bool pairwise,
+ const u8 *addr, bool remove, bool tx_key,
+ int idx, struct key_params *params)
{
int err;
/* devlist mutex needed for possible IBSS re-join */
mutex_lock(&rdev->devlist_mtx);
wdev_lock(dev->ieee80211_ptr);
- err = __cfg80211_set_encryption(rdev, dev, addr, remove,
- tx_key, idx, params);
+ err = __cfg80211_set_encryption(rdev, dev, pairwise, addr,
+ remove, tx_key, idx, params);
wdev_unlock(dev->ieee80211_ptr);
mutex_unlock(&rdev->devlist_mtx);
@@ -635,7 +645,7 @@ int cfg80211_wext_siwencode(struct net_device *dev,
else if (!remove)
return -EINVAL;
- return cfg80211_set_encryption(rdev, dev, NULL, remove,
+ return cfg80211_set_encryption(rdev, dev, false, NULL, remove,
wdev->wext.default_key == -1,
idx, &params);
}
@@ -725,7 +735,9 @@ int cfg80211_wext_siwencodeext(struct net_device *dev,
}
return cfg80211_set_encryption(
- rdev, dev, addr, remove,
+ rdev, dev,
+ !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY),
+ addr, remove,
ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY,
idx, &params);
}
@@ -1354,6 +1366,10 @@ struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
}
wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
+ if (sinfo.filled & STATION_INFO_RX_DROP_MISC)
+ wstats.discard.misc = sinfo.rx_dropped_misc;
+ if (sinfo.filled & STATION_INFO_TX_FAILED)
+ wstats.discard.retries = sinfo.tx_failed;
return &wstats;
}
@@ -1420,6 +1436,9 @@ int cfg80211_wext_giwessid(struct net_device *dev,
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
+ data->flags = 0;
+ data->length = 0;
+
switch (wdev->iftype) {
case NL80211_IFTYPE_ADHOC:
return cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 0ef17bc42ba..dc675a3daa3 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -611,7 +611,7 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev)
#endif
#ifdef CONFIG_CFG80211_WEXT
- if (dev->ieee80211_ptr && dev->ieee80211_ptr &&
+ if (dev->ieee80211_ptr &&
dev->ieee80211_ptr->wiphy &&
dev->ieee80211_ptr->wiphy->wext &&
dev->ieee80211_ptr->wiphy->wext->get_wireless_stats)
@@ -782,6 +782,22 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
}
}
+ if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) {
+ /*
+ * If this is a GET, but not NOMAX, it means that the extra
+ * data is not bounded by userspace, but by max_tokens. Thus
+ * set the length to max_tokens. This matches the extra data
+ * allocation.
+ * The driver should fill it with the number of tokens it
+ * provided, and it may check iwp->length rather than having
+ * knowledge of max_tokens. If the driver doesn't change the
+ * iwp->length, this ioctl just copies back max_token tokens
+ * filled with zeroes. Hopefully the driver isn't claiming
+ * them to be valid data.
+ */
+ iwp->length = descr->max_tokens;
+ }
+
err = handler(dev, info, (union iwreq_data *) iwp, extra);
iwp->length += essid_compat;
diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c
index 3feb28e41c5..674d426a9d2 100644
--- a/net/wireless/wext-priv.c
+++ b/net/wireless/wext-priv.c
@@ -152,7 +152,7 @@ static int ioctl_private_iw_point(struct iw_point *iwp, unsigned int cmd,
} else if (!iwp->pointer)
return -EFAULT;
- extra = kmalloc(extra_size, GFP_KERNEL);
+ extra = kzalloc(extra_size, GFP_KERNEL);
if (!extra)
return -ENOMEM;
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 9818198add8..6fffe62d7c2 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -197,6 +197,8 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev,
wdev->wext.connect.ssid_len = len;
wdev->wext.connect.crypto.control_port = false;
+ wdev->wext.connect.crypto.control_port_ethertype =
+ cpu_to_be16(ETH_P_PAE);
err = cfg80211_mgd_wext_connect(rdev, wdev);
out:
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e6759c9660b..2196e55e4f6 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -5,6 +5,7 @@
config X25
tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
depends on EXPERIMENTAL
+ depends on BKL # should be fixable
---help---
X.25 is a set of standardized network protocols, similar in scope to
frame relay; the one physical line from your box to the X.25 network
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5e86d4e97dc..f7af98dff40 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -507,14 +507,14 @@ static int x25_listen(struct socket *sock, int backlog)
struct sock *sk = sock->sk;
int rc = -EOPNOTSUPP;
- lock_kernel();
+ lock_sock(sk);
if (sk->sk_state != TCP_LISTEN) {
memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN);
sk->sk_max_ack_backlog = backlog;
sk->sk_state = TCP_LISTEN;
rc = 0;
}
- unlock_kernel();
+ release_sock(sk);
return rc;
}
@@ -688,7 +688,6 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
int len, i, rc = 0;
- lock_kernel();
if (!sock_flag(sk, SOCK_ZAPPED) ||
addr_len != sizeof(struct sockaddr_x25) ||
addr->sx25_family != AF_X25) {
@@ -704,12 +703,13 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
}
}
+ lock_sock(sk);
x25_sk(sk)->source_addr = addr->sx25_addr;
x25_insert_socket(sk);
sock_reset_flag(sk, SOCK_ZAPPED);
+ release_sock(sk);
SOCK_DEBUG(sk, "x25_bind: socket is bound\n");
out:
- unlock_kernel();
return rc;
}
@@ -751,7 +751,6 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
struct x25_route *rt;
int rc = 0;
- lock_kernel();
lock_sock(sk);
if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
sock->state = SS_CONNECTED;
@@ -829,7 +828,6 @@ out_put_route:
x25_route_put(rt);
out:
release_sock(sk);
- unlock_kernel();
return rc;
}
@@ -869,8 +867,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
struct sk_buff *skb;
int rc = -EINVAL;
- lock_kernel();
- if (!sk || sk->sk_state != TCP_LISTEN)
+ if (!sk)
goto out;
rc = -EOPNOTSUPP;
@@ -878,6 +875,10 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
goto out;
lock_sock(sk);
+ rc = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out2;
+
rc = x25_wait_for_data(sk, sk->sk_rcvtimeo);
if (rc)
goto out2;
@@ -897,7 +898,6 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
out2:
release_sock(sk);
out:
- unlock_kernel();
return rc;
}
@@ -909,7 +909,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
struct x25_sock *x25 = x25_sk(sk);
int rc = 0;
- lock_kernel();
if (peer) {
if (sk->sk_state != TCP_ESTABLISHED) {
rc = -ENOTCONN;
@@ -923,19 +922,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
*uaddr_len = sizeof(*sx25);
out:
- unlock_kernel();
- return rc;
-}
-
-static unsigned int x25_datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
-{
- int rc;
-
- lock_kernel();
- rc = datagram_poll(file, sock, wait);
- unlock_kernel();
-
return rc;
}
@@ -1746,7 +1732,7 @@ static const struct proto_ops x25_proto_ops = {
.socketpair = sock_no_socketpair,
.accept = x25_accept,
.getname = x25_getname,
- .poll = x25_datagram_poll,
+ .poll = datagram_poll,
.ioctl = x25_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_x25_ioctl,
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index a3cca0a9434..64f2ae1fdc1 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -101,7 +101,7 @@ resume:
err = -EHOSTUNREACH;
goto error_nolock;
}
- skb_dst_set_noref(skb, dst);
+ skb_dst_set(skb, dst_clone(dst));
x = dst->xfrm;
} while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 2b3ed7ad493..044e7789851 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -50,6 +50,9 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
static void xfrm_init_pmtu(struct dst_entry *dst);
static int stale_bundle(struct dst_entry *dst);
+static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *xdst,
+ struct flowi *fl, int family, int strict);
+
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
int dir);
@@ -1175,9 +1178,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
tmpl->mode == XFRM_MODE_BEET) {
remote = &tmpl->id.daddr;
local = &tmpl->saddr;
- family = tmpl->encap_family;
- if (xfrm_addr_any(local, family)) {
- error = xfrm_get_saddr(net, &tmp, remote, family);
+ if (xfrm_addr_any(local, tmpl->encap_family)) {
+ error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
if (error)
goto fail;
local = &tmp;
@@ -2277,7 +2279,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
* still valid.
*/
-int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
+static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
struct flowi *fl, int family, int strict)
{
struct dst_entry *dst = &first->u.dst;
@@ -2359,8 +2361,6 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
return 1;
}
-EXPORT_SYMBOL(xfrm_bundle_ok);
-
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
struct net *net;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5208b12fbfb..eb96ce52f17 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -656,15 +656,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
EXPORT_SYMBOL(xfrm_sad_getinfo);
static int
-xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
- struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr,
- unsigned short family)
+xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
+ struct xfrm_tmpl *tmpl,
+ xfrm_address_t *daddr, xfrm_address_t *saddr,
+ unsigned short family)
{
struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
if (!afinfo)
return -1;
- afinfo->init_tempsel(x, fl, tmpl, daddr, saddr);
+ afinfo->init_tempsel(&x->sel, fl);
+
+ if (family != tmpl->encap_family) {
+ xfrm_state_put_afinfo(afinfo);
+ afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+ if (!afinfo)
+ return -1;
+ }
+ afinfo->init_temprop(x, tmpl, daddr, saddr);
xfrm_state_put_afinfo(afinfo);
return 0;
}
@@ -790,37 +798,38 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
int error = 0;
struct xfrm_state *best = NULL;
u32 mark = pol->mark.v & pol->mark.m;
+ unsigned short encap_family = tmpl->encap_family;
to_put = NULL;
spin_lock_bh(&xfrm_state_lock);
- h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, family);
+ h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
- if (x->props.family == family &&
+ if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
- xfrm_state_addr_check(x, daddr, saddr, family) &&
+ xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
tmpl->mode == x->props.mode &&
tmpl->id.proto == x->id.proto &&
(tmpl->id.spi == x->id.spi || !tmpl->id.spi))
- xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+ xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
&best, &acquire_in_progress, &error);
}
if (best)
goto found;
- h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family);
+ h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) {
- if (x->props.family == family &&
+ if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
- xfrm_state_addr_check(x, daddr, saddr, family) &&
+ xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
tmpl->mode == x->props.mode &&
tmpl->id.proto == x->id.proto &&
(tmpl->id.spi == x->id.spi || !tmpl->id.spi))
- xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+ xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
&best, &acquire_in_progress, &error);
}
@@ -829,7 +838,7 @@ found:
if (!x && !error && !acquire_in_progress) {
if (tmpl->id.spi &&
(x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
- tmpl->id.proto, family)) != NULL) {
+ tmpl->id.proto, encap_family)) != NULL) {
to_put = x0;
error = -EEXIST;
goto out;
@@ -839,9 +848,9 @@ found:
error = -ENOMEM;
goto out;
}
- /* Initialize temporary selector matching only
+ /* Initialize temporary state matching only
* to current session. */
- xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
+ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
memcpy(&x->mark, &pol->mark, sizeof(x->mark));
error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid);
@@ -856,10 +865,10 @@ found:
x->km.state = XFRM_STATE_ACQ;
list_add(&x->km.all, &net->xfrm.state_all);
hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
- h = xfrm_src_hash(net, daddr, saddr, family);
+ h = xfrm_src_hash(net, daddr, saddr, encap_family);
hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
if (x->id.spi) {
- h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, family);
+ h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
}
x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ba59983aaff..8bae6b22c84 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1801,7 +1801,7 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
struct xfrm_user_expire *ue = nlmsg_data(nlh);
struct xfrm_usersa_info *p = &ue->state;
struct xfrm_mark m;
- u32 mark = xfrm_mark_get(attrs, &m);;
+ u32 mark = xfrm_mark_get(attrs, &m);
x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family);
@@ -2504,7 +2504,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
if (p->dir > XFRM_POLICY_OUT)
return NULL;
- xp = xfrm_policy_alloc(net, GFP_KERNEL);
+ xp = xfrm_policy_alloc(net, GFP_ATOMIC);
if (xp == NULL) {
*dir = -ENOBUFS;
return NULL;