[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <24573842.post@talk.nabble.com>
Date: Mon, 20 Jul 2009 10:12:53 -0700 (PDT)
From: Franck Chionna <infos@...shuploaded.com>
To: netdev@...r.kernel.org
Subject: Re: bonding: bug in balance-alb mode (incorrect update-ARP-replies)
JUNG, Christian wrote:
>
> Hello,
>
> I've discovered a bug in the bonding module of the Linux Kernel, which
> appears
> only in bonding-mode balance-alb.
>
> Description:
>
> You have to setup a box with at least two NICs, a bonding device
> enslaving
> those, assign at least two IPs to the bond and make some traffic from
> a
> different machine to one of those IPs.
>
> If you delete that IP, the box will nevertheless keep sending ARP replies
> to the machine that was communicating with that IP before it was removed.
>
> This comes from the rx_hashtbl and the receive load balancing
> algorithm.
>
> The bug is very serious if bonding is used in a cluster-environment
> using
> two nodes which are connected to the same subnet. If an IP-bound
> service
> has
> to failover to the other node, the old node would announce its
> MAC-address
> for the IP which isn't owned by the node anymore. So client-traffic in
> the
> same net would hit the old node.
>
> A possible workaround could be the usage of balance-tlb instead of
> balance-alb.
>
> I've made a little patch which removes every entry for an IP from the
> rx_hashtbl when that IP is removed from the bond. The patch was made for
> Linux Kernel version 2.6.19.
>
> ---8<---
> diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.c
> linux/drivers/net/bonding/bond_alb.c
> --- linux-2.6.19/drivers/net/bonding/bond_alb.c 2006-11-29
> 22:57:37.000000000 +0100
> +++ linux/drivers/net/bonding/bond_alb.c 2007-01-16
> 17:23:53.000000000 +0100
> @@ -1677,3 +1677,38 @@
> }
> }
>
> +void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip) {
> + struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
> + u32 curr_index;
> +
> + dprintk("%s: removing entries from rx_hashtbl for IP %lx\n",
> bond->dev->name, ip);
> + _lock_rx_hashtbl(bond);
> +
> + curr_index = bond_info->rx_hashtbl_head;
> + while (curr_index != RLB_NULL_INDEX) {
> + struct rlb_client_info *curr =
> &(bond_info->rx_hashtbl[curr_index]);
> + u32 next_index = bond_info->rx_hashtbl[curr_index].next;
> + u32 prev_index = bond_info->rx_hashtbl[curr_index].prev;
> +
> + if (curr->ip_src == ip) {
> + dprintk("%s: entry %u matched\n", bond->dev->name,
> curr_index);
> +
> + if (curr_index == bond_info->rx_hashtbl_head) {
> + bond_info->rx_hashtbl_head = next_index;
> + }
> + if (prev_index != RLB_NULL_INDEX) {
> + bond_info->rx_hashtbl[prev_index].next =
> next_index;
> + }
> + if (next_index != RLB_NULL_INDEX) {
> + bond_info->rx_hashtbl[next_index].prev =
> prev_index;
> + }
> +
> + rlb_init_table_entry(curr);
> + }
> +
> + curr_index = next_index;
> + }
> +
> + _unlock_rx_hashtbl(bond);
> +}
> +
> diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.h
> linux/drivers/net/bonding/bond_alb.h
> --- linux-2.6.19/drivers/net/bonding/bond_alb.h 2006-11-29
> 22:57:37.000000000 +0100
> +++ linux/drivers/net/bonding/bond_alb.h 2007-01-16
> 17:23:53.000000000 +0100
> @@ -128,5 +128,6 @@
> void bond_alb_monitor(struct bonding *bond);
> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
> +void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip);
> #endif /* __BOND_ALB_H__ */
>
> diff -ur linux-2.6.19/drivers/net/bonding/bond_main.c
> linux/drivers/net/bonding/bond_main.c
> --- linux-2.6.19/drivers/net/bonding/bond_main.c 2006-11-29
> 22:57:37.000000000 +0100
> +++ linux/drivers/net/bonding/bond_main.c 2007-01-16
> 17:30:49.000000000 +0100
> @@ -3356,6 +3356,12 @@
> return NOTIFY_OK;
> case NETDEV_DOWN:
> bond->master_ip =
> bond_glean_dev_ip(bond->dev);
> +
> + /* remove IP from RLB hashtable if using
> balance-alb mode: */
> + if (bond->params.mode == BOND_MODE_ALB) {
> + bond_alb_remove_ip_from_rlb(bond,
> ifa->ifa_local);
> + }
> +
> return NOTIFY_OK;
> default:
> return NOTIFY_DONE;
> ---8<---
>
> The function bond_alb_remove_ip_from_rlb is heavily based on the function
> rlb_clear_vlan.
>
> And here's a useful patch for debugging purposes (it outputs the
> rx_hashtbl
> in
> the proc-file of the bond):
>
> ---8<---
> diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.c
> linux/drivers/net/bonding/bond_alb.c
> --- linux-2.6.19/drivers/net/bonding/bond_alb.c 2007-01-16
> 18:59:32.000000000 +0100
> +++ linux/drivers/net/bonding/bond_alb.c 2007-01-16
> 18:48:15.000000000 +0100
> @@ -26,6 +26,7 @@
> #include <linux/netdevice.h>
> #include <linux/etherdevice.h>
> #include <linux/pkt_sched.h>
> +#include <linux/seq_file.h>
> #include <linux/spinlock.h>
> #include <linux/slab.h>
> #include <linux/timer.h>
> @@ -1677,6 +1678,45 @@
> }
> }
>
> +void bond_alb_info_show(struct seq_file *seq) {
> + struct bonding *bond = seq->private;
> + struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
> + struct rlb_client_info *rx_hash_table;
> + u32 index;
> + u32 src, dst;
> +
> + seq_puts(seq, "\nALB info\n\n");
> + seq_puts(seq, " Receive Load Balancing table:\n\n");
> + seq_puts(seq, " Index Slave Server Client
> Client-MAC Asgnd\n");
> +
> + _lock_rx_hashtbl(bond);
> +
> + rx_hash_table = bond_info->rx_hashtbl;
> +
> + if (rx_hash_table != NULL) {
> + for (index = bond_info->rx_hashtbl_head;
> + index != RLB_NULL_INDEX;
> + index = rx_hash_table[index].next) {
> + src = ntohl(rx_hash_table[index].ip_src);
> + dst = ntohl(rx_hash_table[index].ip_dst);
> +
> + seq_printf(seq,
> + " %03u %8s %03u.%03u.%03u.%03u
> %03u.%03u.%03u.%03u %02x:%02x:%02x:%02x:%02x:%02x %3s\n",
> + index,
> + (rx_hash_table[index].slave != NULL
> ? rx_hash_table[index].slave->dev->name : "none"),
> + ((src >> 24) & 0xff), ((src >> 16) &
> 0xff), ((src >> 8) & 0xff), (src & 0xff),
> + ((dst >> 24) & 0xff), ((dst >> 16) &
> 0xff), ((dst >> 8) & 0xff), (dst & 0xff),
> + rx_hash_table[index].mac_dst[0],
> rx_hash_table[index].mac_dst[1],
> + rx_hash_table[index].mac_dst[2],
> rx_hash_table[index].mac_dst[3],
> + rx_hash_table[index].mac_dst[4],
> rx_hash_table[index].mac_dst[5],
> + (rx_hash_table[index].assigned ?
> "yes" : "no")
> + );
> + }
> + }
> +
> + _unlock_rx_hashtbl(bond);
> +}
> +
> void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip) {
> struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
> u32 curr_index;
> diff -ur linux-2.6.19/drivers/net/bonding/bond_alb.h
> linux/drivers/net/bonding/bond_alb.h
> --- linux-2.6.19/drivers/net/bonding/bond_alb.h 2007-01-16
> 18:59:32.000000000 +0100
> +++ linux/drivers/net/bonding/bond_alb.h 2007-01-16
> 19:01:46.000000000 +0100
> @@ -128,6 +128,7 @@
> void bond_alb_monitor(struct bonding *bond);
> int bond_alb_set_mac_address(struct net_device *bond_dev, void *addr);
> void bond_alb_clear_vlan(struct bonding *bond, unsigned short vlan_id);
> +void bond_alb_info_show(struct seq_file *seq);
> void bond_alb_remove_ip_from_rlb(struct bonding *bond, u32 ip);
> #endif /* __BOND_ALB_H__ */
>
> diff -ur linux-2.6.19/drivers/net/bonding/bond_main.c
> linux/drivers/net/bonding/bond_main.c
> --- linux-2.6.19/drivers/net/bonding/bond_main.c 2007-01-16
> 18:59:32.000000000 +0100
> +++ linux/drivers/net/bonding/bond_main.c 2007-01-16
> 18:48:15.000000000 +0100
> @@ -3048,6 +3048,10 @@
> ad_info.partner_system[5]);
> }
> }
> + else
> + if (bond->params.mode == BOND_MODE_ALB) {
> + bond_alb_info_show(seq);
> + }
> }
>
> static void bond_info_show_slave(struct seq_file *seq, const struct slave
> *slave)
> ---8<---
>
> I attach this example to visualize the bug. The box is named 'linux'
> (which
> has
> the two IPs 10.0.91.128 and 10.0.91.129) and the other machine (which
> makes
> some traffic) is called 'dave'. Their clocks are synchronized via NTP.
>
> ---8<---
> linux:~ # modprobe bonding miimon=100 updelay=200 mode=balance-alb
> use_carrier=0
> linux:~ # ifconfig bond0 10.0.91.128 netmask 255.255.255.0 up
> linux:~ # ifenslave bond0 eth1
> linux:~ # ifenslave bond0 eth2
> linux:~ # ip addr add 10.0.91.129 dev bond0
> linux:~ # ip addr sh bond0
> 18: bond0: <BROADCAST,MULTICAST,MASTER,UP> mtu 1500 qdisc noqueue
> link/ether 00:02:b3:55:2e:b1 brd ff:ff:ff:ff:ff:ff
> inet 10.0.91.128/24 brd 10.255.255.255 scope global bond0
> inet 10.0.91.129/32 scope global bond0
> inet6 fe80::200:ff:fe00:0/64 scope link
> valid_lft forever preferred_lft forever
> ---
>
> dave:~ # ping 10.0.91.129
> PING 10.0.91.129 (10.0.91.129) 56(84) bytes of data.
> 64 bytes from 10.0.91.129: icmp_seq=1 ttl=64 time=3.83 ms
> 64 bytes from 10.0.91.129: icmp_seq=2 ttl=64 time=0.205 ms
> [...]
> dave:~ # tcpdump -i bond0 arp host 10.0.91.129
> tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
> listening on bond0, link-type EN10MB (Ethernet), capture size 96 bytes
> 11:55:41.829735 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> 11:55:41.830993 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> 11:55:44.047261 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> 11:55:44.047276 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> [...]
>
> ---
>
> linux:~ # ip addr del 10.0.91.129 dev bond0
> linux:~ # ip addr sh bond0
> 18: bond0: <BROADCAST,MULTICAST,MASTER,UP> mtu 1500 qdisc noqueue
> link/ether 00:02:b3:55:2e:b1 brd ff:ff:ff:ff:ff:ff
> inet 10.0.91.128/24 brd 10.255.255.255 scope global bond0
> inet6 fe80::200:ff:fe00:0/64 scope link
> valid_lft forever preferred_lft forever
> linux:~ # date
> Tue Jan 16 11:55:57 CET 2007
>
> ---
>
> dave:~ # date
> Tue Jan 16 11:56:59 CET 2007
> dave:~ # tcpdump -i bond0 arp host 10.0.91.129
> tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
> listening on bond0, link-type EN10MB (Ethernet), capture size 96 bytes
> 11:57:04.305078 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> 11:57:04.306248 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> 11:57:06.704552 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> 11:57:06.704569 arp reply 10.0.91.129 is-at 00:02:b3:55:2e:b1 (oui
> Unknown)
> [...]
> ---8<---
>
>
> Bye
> Christian Jung
>
> PS I'm sorry, but I have to use a mailer which has some limitations. If the
> whitespace in the patches has been mangled in any way, I can send you the
> patches as attachments.
>
> Another thing: When shutting down a bond (e.g. ifconfig bond0 0.0.0.0
> down)
> the
> slaves keep the master IP address of the bond. Is there a special reason
> for
> this behaviour?
>
> phone: +49 6898/10-4987
> fax: +49 6898/10-54987
> http://www.saarstahl.de
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
The problem still seems to exist in the latest kernel as of today (2.6.30.1).
Why has your patch not been integrated by the kernel community?
--
View this message in context: http://www.nabble.com/bonding%3A-bug-in-balance-alb-mode-%28incorrect-update-ARP-replies%29-tp8527082p24573842.html
Sent from the netdev mailing list archive at Nabble.com.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists