[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120911123746.4305.26241.stgit@dragon>
Date: Tue, 11 Sep 2012 14:38:22 +0200
From: Jesper Dangaard Brouer <brouer@...hat.com>
To: Hans Schillstrom <hans@...illstrom.com>,
Hans Schillstrom <hans.schillstrom@...csson.com>,
netdev@...r.kernel.org, "Patrick McHardy" <kaber@...sh.net>,
Pablo Neira Ayuso <pablo@...filter.org>,
lvs-devel@...r.kernel.org, Julian Anastasov <ja@....bg>
Cc: Jesper Dangaard Brouer <brouer@...hat.com>,
Thomas Graf <tgraf@...g.ch>,
Wensong Zhang <wensong@...ux-vs.org>,
netfilter-devel@...r.kernel.org, Simon Horman <horms@...ge.net.au>
Subject: [PATCH V3 6/8] ipvs: Complete IPv6 fragment handling for IPVS
IPVS now supports fragmented packets, with support from nf_conntrack_reasm.c
Based on patch from: Hans Schillstrom.
IPVS does what conntrack does, i.e. it uses skb->nfct_reasm
(when all fragments are collected, nf_ct_frag6_output()
starts a "re-play" of all fragments into the interrupted
PREROUTING chain at prio -399 (NF_IP6_PRI_CONNTRACK_DEFRAG+1)
with nfct_reasm pointing to the assembled packet.)
Note that the nf_defrag_ipv6 module must be loaded for this to work.
Unhandled fragments are reported, recommending that the user load nf_defrag_ipv6.
To handle fw-mark for fragments, add a new IPVS hook into the prerouting
chain at prio -99 (NF_IP6_PRI_NAT_DST+1) to catch fragments, and copy
fw-mark info from the first packet with an upper layer header.
IPv6 fragment handling should be the last thing on the IPVS IPv6
missing support list.
Signed-off-by: Jesper Dangaard Brouer <brouer@...hat.com>
Signed-off-by: Hans Schillstrom <hans@...illstrom.com>
---
V3:
- In case of no nf_defrag_ipv6, second fragments could create
strange conn entries, fix that.
- Report unhandled fragments, and recommend user to load nf_defrag_ipv6.
- Move ICMPv6 improvements to separate patch
V2:
- In ip_vs_in_icmp_v6() hint that &ciph.len can be updated
- Add NULL pointer check in ip_vs_in_icmp_v6()
V1:
- Fixed refcnt bug since last.
include/net/ip_vs.h | 39 +++++++++++++
net/netfilter/ipvs/Kconfig | 6 +-
net/netfilter/ipvs/ip_vs_conn.c | 2 -
net/netfilter/ipvs/ip_vs_core.c | 117 ++++++++++++++++++++++++++++++++-------
net/netfilter/ipvs/ip_vs_xmit.c | 33 ++++++++---
5 files changed, 162 insertions(+), 35 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 29265bf..98806b6 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -109,6 +109,7 @@ extern int ip_vs_conn_tab_size;
struct ip_vs_iphdr {
__u32 len; /* IPv4 simply where L4 starts
IPv6 where L4 Transport Header starts */
+ __u32 thoff_reasm; /* Transport Header Offset in nfct_reasm skb */
__u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/
__s16 protocol;
__s32 flags;
@@ -116,6 +117,35 @@ struct ip_vs_iphdr {
union nf_inet_addr daddr;
};
+/* Dependency to module: nf_defrag_ipv6 */
+#if defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE)
+static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
+{
+ return skb->nfct_reasm;
+}
+static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
+ int len, void *buffer,
+ const struct ip_vs_iphdr *ipvsh)
+{
+ if (unlikely(ipvsh->fragoffs && skb_nfct_reasm(skb)))
+ return skb_header_pointer(skb_nfct_reasm(skb),
+ ipvsh->thoff_reasm, len, buffer);
+
+ return skb_header_pointer(skb, offset, len, buffer);
+}
+#else
+static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
+{
+ return NULL;
+}
+static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
+ int len, void *buffer,
+ const struct ip_vs_iphdr *ipvsh)
+{
+ return skb_header_pointer(skb, offset, len, buffer);
+}
+#endif
+
static inline void
ip_vs_fill_ip4hdr(const void *nh, struct ip_vs_iphdr *iphdr)
{
@@ -141,12 +171,19 @@ ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr *iphdr)
(struct ipv6hdr *)skb_network_header(skb);
iphdr->saddr.in6 = iph->saddr;
iphdr->daddr.in6 = iph->daddr;
- /* ipv6_find_hdr() updates len, flags */
+ /* ipv6_find_hdr() updates len, flags, thoff_reasm */
+ iphdr->thoff_reasm = 0;
iphdr->len = 0;
iphdr->flags = 0;
iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1,
&iphdr->fragoffs,
&iphdr->flags);
+ /* get proto from re-assembled packet and it's offset */
+ if (skb_nfct_reasm(skb))
+ iphdr->protocol = ipv6_find_hdr(skb_nfct_reasm(skb),
+ &iphdr->thoff_reasm,
+ -1, NULL, NULL);
+
} else
#endif
{
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index a97ae53..0c3b167 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -30,11 +30,9 @@ config IP_VS_IPV6
depends on IPV6 = y || IP_VS = IPV6
select IP6_NF_IPTABLES
---help---
- Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+ Add IPv6 support to IPVS.
- See http://www.mindbasket.com/ipvs for more information.
-
- Say N if unsure.
+ Say Y if unsure.
config IP_VS_DEBUG
bool "IP virtual server debugging"
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 1548df9..d6c1c26 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -314,7 +314,7 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
__be16 _ports[2], *pptr;
struct net *net = skb_net(skb);
- pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, proto_off, sizeof(_ports), _ports, iph);
if (pptr == NULL)
return 1;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index caed44d..4b9995e 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -402,8 +402,12 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
unsigned int flags;
*ignored = 1;
+
+ /*
+ * IPv6 frags, only the first hit here.
+ */
ip_vs_fill_iph_skb(svc->af, skb, &iph);
- pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph.len, sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NULL;
@@ -507,8 +511,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
#endif
ip_vs_fill_iph_skb(svc->af, skb, &iph);
-
- pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph.len, sizeof(_ports), _ports, &iph);
if (pptr == NULL) {
ip_vs_service_put(svc);
return NF_DROP;
@@ -654,14 +657,6 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
return err;
}
-#ifdef CONFIG_IP_VS_IPV6
-static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
-{
- /* TODO IPv6: Find out what to do here for IPv6 */
- return 0;
-}
-#endif
-
static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
{
#ifdef CONFIG_IP_VS_IPV6
@@ -941,8 +936,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
ip_vs_fill_iph_skb(AF_INET6, skb, ipvsh);
*related = 1;
-
- ic = skb_header_pointer(skb, ipvsh->len, sizeof(_icmph), &_icmph);
+ ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
if (ic == NULL)
return NF_DROP;
@@ -961,6 +955,11 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
*related = 0;
return NF_ACCEPT;
}
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (ipvsh->flags & IP6T_FH_F_FRAG)
+ return NF_DROP;
/* Now find the contained IP header */
ipvsh->len += sizeof(_icmph);
@@ -1117,6 +1116,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
ip_vs_fill_iph_skb(af, skb, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
+ if (!iph.fragoffs && skb_nfct_reasm(skb)) {
+ struct sk_buff *reasm = skb_nfct_reasm(skb);
+ /* Save fw mark for coming frags */
+ reasm->ipvs_property = 1;
+ reasm->mark = skb->mark;
+ }
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_out_icmp_v6(skb, &related,
@@ -1124,7 +1129,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (related)
return verdict;
- ip_vs_fill_iph_skb(af, skb, &iph);
}
} else
#endif
@@ -1134,7 +1138,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
if (related)
return verdict;
- ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
}
pd = ip_vs_proto_data_get(net, iph.protocol);
@@ -1167,8 +1170,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
pp->protocol == IPPROTO_SCTP)) {
__be16 _ports[2], *pptr;
- pptr = skb_header_pointer(skb, iph.len,
- sizeof(_ports), _ports);
+ pptr = frag_safe_skb_hp(skb, iph.len,
+ sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
if (ip_vs_lookup_real_service(net, af, iph.protocol,
@@ -1468,7 +1471,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
*related = 1;
- ic = skb_header_pointer(skb, iph->len, sizeof(_icmph), &_icmph);
+ ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
if (ic == NULL)
return NF_DROP;
@@ -1487,6 +1490,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
*related = 0;
return NF_ACCEPT;
}
+ /* Fragment header that is before ICMP header tells us that:
+ * it's not an error message since they can't be fragmented.
+ */
+ if (iph->flags & IP6T_FH_F_FRAG)
+ return NF_DROP;
/* Now find the contained IP header */
ciph.len = iph->len + sizeof(_icmph);
@@ -1511,10 +1519,20 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
"Checking incoming ICMPv6 for");
- /* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_in_get(AF_INET6, skb, &ciph, ciph.len, 1);
+ /* The embedded headers contain source and dest in reverse order
+ * if not from localhost
+ */
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph, ciph.len,
+ (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
+
if (!cp)
return NF_ACCEPT;
+ /* VS/TUN, VS/DR and LOCALNODE just let it go */
+ if ((hooknum == NF_INET_LOCAL_OUT) &&
+ (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
+ __ip_vs_conn_put(cp);
+ return NF_ACCEPT;
+ }
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
@@ -1584,6 +1602,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
+ if (!iph.fragoffs && skb_nfct_reasm(skb)) {
+ struct sk_buff *reasm = skb_nfct_reasm(skb);
+ /* Save fw mark for coming frags. */
+ reasm->ipvs_property = 1;
+ reasm->mark = skb->mark;
+ }
if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
int related;
int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
@@ -1608,13 +1632,16 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
pp = pd->pp;
/*
* Check if the packet belongs to an existing connection entry
- * Only sched first IPv6 fragment.
*/
cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
if (unlikely(!cp) && !iph.fragoffs) {
+ /* No (second) fragments need to enter here, as nf_defrag_ipv6
+ * replayed fragment zero will already have created the cp
+ */
int v;
+ /* Schedule and create new connection entry into &cp */
if (!pp->conn_schedule(af, skb, pd, &v, &cp))
return v;
}
@@ -1623,6 +1650,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, af, pp, skb, 0,
"ip_vs_in: packet continues traversal as normal");
+ if (iph.fragoffs && !skb_nfct_reasm(skb)) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * and don't have any pointer to a reasm skb
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+ }
return NF_ACCEPT;
}
@@ -1707,6 +1742,38 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
#ifdef CONFIG_IP_VS_IPV6
/*
+ * AF_INET6 fragment handling
+ * Copy info from first fragment, to the rest of them.
+ */
+static unsigned int
+ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *reasm = skb_nfct_reasm(skb);
+ struct net *net;
+
+ /* Skip if not a "replay" from nf_ct_frag6_output or first fragment.
+ * ipvs_property is set when checking first fragment
+ * in ip_vs_in() and ip_vs_out().
+ */
+ if (reasm)
+ IP_VS_DBG(2, "Fragment recv prop:%d\n", reasm->ipvs_property);
+ if (!reasm || !reasm->ipvs_property)
+ return NF_ACCEPT;
+
+ net = skb_net(skb);
+ if (!net_ipvs(net)->enable)
+ return NF_ACCEPT;
+
+ /* Copy stored fw mark, saved in ip_vs_{in,out} */
+ skb->mark = reasm->mark;
+
+ return NF_ACCEPT;
+}
+
+/*
* AF_INET6 handler in NF_INET_LOCAL_IN chain
* Schedule and forward packets from remote clients
*/
@@ -1845,6 +1912,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
.priority = 100,
},
#ifdef CONFIG_IP_VS_IPV6
+ /* After mangle & nat fetch 2:nd fragment and following */
+ {
+ .hook = ip_vs_preroute_frag6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_NAT_DST + 1,
+ },
/* After packet filtering, change source only for VS/NAT */
{
.hook = ip_vs_reply6,
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 428de75..e44933f 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -496,12 +496,13 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rt6_info *rt; /* Route to the other host */
- struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct ip_vs_iphdr iph;
int mtu;
EnterFunction(10);
+ ip_vs_fill_iph_skb(cp->af, skb, &iph);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0,
+ if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph.daddr.in6, NULL, 0,
IP_VS_RT_MODE_NON_LOCAL)))
goto tx_error_icmp;
@@ -513,7 +514,9 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->dev = net->loopback_dev;
}
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ /* only send ICMP too big on first fragment */
+ if (!iph.fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
dst_release(&rt->dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
@@ -685,7 +688,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_fill_iph_skb(cp->af, skb, &iph);
/* check if it is a connection of no-client-port */
- if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph.fragoffs)) {
__be16 _pt, *p;
p = skb_header_pointer(skb, iph.len, sizeof(_pt), &_pt);
if (p == NULL)
@@ -735,7 +738,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->dev = net->loopback_dev;
}
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ /* only send ICMP too big on first fragment */
+ if (!iph.fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): frag needed for");
goto tx_error_put;
@@ -940,8 +945,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
unsigned int max_headroom; /* The extra header space needed */
int mtu;
int ret;
+ struct ip_vs_iphdr ipvsh;
EnterFunction(10);
+ ip_vs_fill_iph_skb(cp->af, skb, &ipvsh);
if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
&saddr, 1, (IP_VS_RT_MODE_LOCAL |
@@ -970,7 +977,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->dev = net->loopback_dev;
}
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh.fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
}
@@ -1116,8 +1125,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct rt6_info *rt; /* Route to the other host */
int mtu;
+ struct ip_vs_iphdr iph;
EnterFunction(10);
+ ip_vs_fill_iph_skb(cp->af, skb, &iph);
if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
0, (IP_VS_RT_MODE_LOCAL |
@@ -1136,7 +1147,9 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->dev = net->loopback_dev;
}
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ /* only send ICMP too big on first fragment */
+ if (!iph.fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
dst_release(&rt->dst);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error;
@@ -1308,8 +1321,10 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
int rc;
int local;
int rt_mode;
+ struct ip_vs_iphdr iph;
EnterFunction(10);
+ ip_vs_fill_iph_skb(cp->af, skb, &iph);
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
@@ -1372,7 +1387,9 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->dev = net->loopback_dev;
}
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ /* only send ICMP too big on first fragment */
+ if (!iph.fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
IP_VS_DBG_RL("%s(): frag needed\n", __func__);
goto tx_error_put;
}
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists