Message-ID: <20100929193539.GC2864@redhat.com>
Date: Wed, 29 Sep 2010 16:35:39 -0300
From: Flavio Leitner <fbl@...hat.com>
To: Andy Gospodarek <andy@...yhouse.net>
Cc: netdev@...r.kernel.org
Subject: Re: [PATCH] bonding: rejoin multicast groups on VLANs
On Wed, Sep 29, 2010 at 02:44:13PM -0400, Andy Gospodarek wrote:
> On Wed, Sep 29, 2010 at 04:12:24AM -0300, Flavio Leitner wrote:
> > It fixes bonding to rejoin multicast groups added
> > to VLAN devices on top of bonding when a failover
> > happens.
> >
> > The first packet may be discarded, so the timer
> > ensures that at least 3 Reports are sent.
> >
>
> Good find, Flavio. Clearly the fact that multicast membership is broken
> needs to be fixed, but I would rather not see timers used at all. We
> worked hard in the past to eliminate timers for several reasons, so I
> would rather see a workqueue used.
I noticed that the code is using workqueues now; I just thought that
a simple thing which may run only a couple of times would fit
perfectly with a simple timer.
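Just so we are talking about the same thing, here is roughly the
difference between the two approaches as I understand it. This is an
illustration only, with made-up names, not the bonding code; the
practical difference is that a timer callback runs in softirq context
and must not sleep, while delayed work runs in process context on a
worker thread and may sleep or take locks.

/* Illustration only -- made-up names, not the bonding code. */
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct timer_list retrans_timer;
static struct delayed_work retrans_work;

/* Timer variant (what my patch did): the callback runs in softirq
 * context, so it must not sleep or take sleeping locks. */
static void retrans_timer_fn(unsigned long data)
{
	/* resend the membership reports here, then rearm; the real
	 * code would stop after the configured number of resends */
	mod_timer(&retrans_timer, jiffies + HZ / 5);	/* 200ms */
}

static void retrans_timer_start(void)
{
	setup_timer(&retrans_timer, retrans_timer_fn, 0);
	mod_timer(&retrans_timer, jiffies);
}

/* Workqueue variant (what you are suggesting): the work item runs
 * in process context on a worker thread, so it may sleep or take
 * locks such as rtnl if it needs to. */
static void retrans_work_fn(struct work_struct *work)
{
	/* resend the membership reports here, then requeue; again the
	 * real code would stop after the configured number of resends */
	schedule_delayed_work(&retrans_work, HZ / 5);	/* 200ms */
}

static void retrans_work_start(void)
{
	INIT_DELAYED_WORK(&retrans_work, retrans_work_fn);
	schedule_delayed_work(&retrans_work, 0);
}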
> I also don't like retransmitting the membership report 3 times when it
> may not be needed. Though many switches can handle it, the cost of
> receiving and processing what might be a large list of multicast
> addresses every 200ms for 600ms doesn't seem ideal. It also feels like
> a hack. :)
A parameter is definitely much better, but I wasn't sure about the
patch approach, so I was expecting a review like this and planned to
do the needed refinements afterwards. Better to post early, right? :)
I see your point about changing the default to one membership report,
but during a failover we can't be sure that everything has been
received. Also, failovers aren't supposed to keep happening and
flooding the network, so I would rather have a couple of membership
reports sent than watch an important multicast application fail.
Perhaps 3 is too many, but one sounds too few to me.
What do you think?
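To make that concrete, the resend loop I have in mind looks roughly
like the sketch below (made-up names again; the real patch would hang
the work off struct bonding and honor kill_timers):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct delayed_work igmp_work;
static int igmp_reports_left;	/* taken from the resend_igmp parameter */

static void igmp_resend_fn(struct work_struct *work)
{
	/* rejoin/resend the membership reports for all groups here */

	/* requeue every 200ms until the configured count is used up */
	if (--igmp_reports_left > 0)
		schedule_delayed_work(&igmp_work, HZ / 5);
}

static void igmp_resend_start(int count)	/* e.g. resend_igmp=3 */
{
	igmp_reports_left = count;
	INIT_DELAYED_WORK(&igmp_work, igmp_resend_fn);
	schedule_delayed_work(&igmp_work, 0);	/* first report right away */
}

With resend_igmp=3 that means one report at failover time and two
more at roughly 200ms and 400ms; with resend_igmp=1 it degenerates to
the single report you prefer as the default.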
Flavio
>
> If retransmission of the membership reports is a requirement for some
> users, I would rather see it as a configuration option.
>
> Maybe something like this?
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 3b16f62..b7b4a74 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -109,6 +109,7 @@ static char *arp_validate;
> static char *fail_over_mac;
> static int all_slaves_active = 0;
> static struct bond_params bonding_defaults;
> +static int resend_igmp = BOND_DEFAULT_RESEND_IGMP;
>
> module_param(max_bonds, int, 0);
> MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");
> @@ -163,6 +164,8 @@ module_param(all_slaves_active, int, 0);
> MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface"
> "by setting active flag for all slaves. "
> "0 for never (default), 1 for always.");
> +module_param(resend_igmp, int, 0);
> +MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on link failure");
>
> /*----------------------------- Global variables ----------------------------*/
>
> @@ -865,18 +868,14 @@ static void bond_mc_del(struct bonding *bond, void *addr)
> }
>
>
> -/*
> - * Retrieve the list of registered multicast addresses for the bonding
> - * device and retransmit an IGMP JOIN request to the current active
> - * slave.
> - */
> -static void bond_resend_igmp_join_requests(struct bonding *bond)
> +static void __bond_resend_igmp_join_requests(struct net_device *dev)
> {
> struct in_device *in_dev;
> struct ip_mc_list *im;
>
> rcu_read_lock();
> - in_dev = __in_dev_get_rcu(bond->dev);
> +
> + in_dev = __in_dev_get_rcu(dev);
> if (in_dev) {
> for (im = in_dev->mc_list; im; im = im->next)
> ip_mc_rejoin_group(im);
> @@ -885,6 +884,48 @@ static void bond_resend_igmp_join_requests(struct bonding *bond)
> rcu_read_unlock();
> }
>
> +
> +/*
> + * Retrieve the list of registered multicast addresses for the bonding
> + * device and retransmit an IGMP JOIN request to the current active
> + * slave.
> + */
> +static void bond_resend_igmp_join_requests(struct bonding *bond)
> +{
> + struct net_device *vlan_dev;
> + struct vlan_entry *vlan;
> +
> + read_lock(&bond->lock);
> + if (bond->kill_timers)
> + goto out;
> +
> + /* rejoin all groups on bond device */
> + __bond_resend_igmp_join_requests(bond->dev);
> +
> + /* rejoin all groups on vlan devices */
> + if (bond->vlgrp) {
> + list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
> + vlan_dev = vlan_group_get_device(bond->vlgrp,
> + vlan->vlan_id);
> + if (vlan_dev)
> + __bond_resend_igmp_join_requests(vlan_dev);
> + }
> + }
> +
> + if (--bond->igmp_retrans > 0)
> + queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
> +
> +out:
> + read_unlock(&bond->lock);
> +}
> +
> +void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
> +{
> + struct bonding *bond = container_of(work, struct bonding,
> + mcast_work.work);
> + bond_resend_igmp_join_requests(bond);
> +}
> +
> /*
> * flush all members of flush->mc_list from device dev->mc_list
> */
> @@ -944,7 +985,10 @@ static void bond_mc_swap(struct bonding *bond, struct slave *new_active,
>
> netdev_for_each_mc_addr(ha, bond->dev)
> dev_mc_add(new_active->dev, ha->addr);
> - bond_resend_igmp_join_requests(bond);
> +
> + /* rejoin multicast groups */
> + bond->igmp_retrans = bond->params.resend_igmp;
> + queue_delayed_work(bond->wq, &bond->mcast_work, 0);
> }
> }
>
> @@ -3744,6 +3788,9 @@ static int bond_open(struct net_device *bond_dev)
>
> bond->kill_timers = 0;
>
> + /* multicast retrans */
> + INIT_DELAYED_WORK(&bond->mcast_work, bond_resend_igmp_join_requests_delayed);
> +
> if (bond_is_lb(bond)) {
> /* bond_alb_initialize must be called before the timer
> * is started.
> @@ -3828,6 +3875,8 @@ static int bond_close(struct net_device *bond_dev)
> break;
> }
>
> + if (delayed_work_pending(&bond->mcast_work))
> +		cancel_delayed_work(&bond->mcast_work);
>
> if (bond_is_lb(bond)) {
> /* Must be called only after all
> @@ -4699,6 +4748,9 @@ static void bond_work_cancel_all(struct bonding *bond)
> if (bond->params.mode == BOND_MODE_8023AD &&
> delayed_work_pending(&bond->ad_work))
> cancel_delayed_work(&bond->ad_work);
> +
> + if (delayed_work_pending(&bond->mcast_work))
> +		cancel_delayed_work(&bond->mcast_work);
> }
>
> /*
> @@ -4891,6 +4943,12 @@ static int bond_check_params(struct bond_params *params)
> all_slaves_active = 0;
> }
>
> + if (resend_igmp < 0 || resend_igmp > 255) {
> + pr_warning("Warning: resend_igmp (%d) should be between "
> + "0 and 255, resetting to %d\n",
> + resend_igmp, BOND_DEFAULT_RESEND_IGMP);
> + resend_igmp = BOND_DEFAULT_RESEND_IGMP;
> + }
> /* reset values for TLB/ALB */
> if ((bond_mode == BOND_MODE_TLB) ||
> (bond_mode == BOND_MODE_ALB)) {
> @@ -5063,6 +5121,7 @@ static int bond_check_params(struct bond_params *params)
> params->fail_over_mac = fail_over_mac_value;
> params->tx_queues = tx_queues;
> params->all_slaves_active = all_slaves_active;
> + params->resend_igmp = resend_igmp;
>
> if (primary) {
> strncpy(params->primary, primary, IFNAMSIZ);
> diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
> index c6fdd85..c49bdb0 100644
> --- a/drivers/net/bonding/bonding.h
> +++ b/drivers/net/bonding/bonding.h
> @@ -136,6 +136,7 @@ struct bond_params {
> __be32 arp_targets[BOND_MAX_ARP_TARGETS];
> int tx_queues;
> int all_slaves_active;
> + int resend_igmp;
> };
>
> struct bond_parm_tbl {
> @@ -198,6 +199,7 @@ struct bonding {
> s32 slave_cnt; /* never change this value outside the attach/detach wrappers */
> rwlock_t lock;
> rwlock_t curr_slave_lock;
> + s8 igmp_retrans;
> s8 kill_timers;
> s8 send_grat_arp;
> s8 send_unsol_na;
> @@ -223,6 +225,7 @@ struct bonding {
> struct delayed_work arp_work;
> struct delayed_work alb_work;
> struct delayed_work ad_work;
> + struct delayed_work mcast_work;
> #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> struct in6_addr master_ipv6;
> #endif
> diff --git a/include/linux/if_bonding.h b/include/linux/if_bonding.h
> index 2c79943..d2f78b7 100644
> --- a/include/linux/if_bonding.h
> +++ b/include/linux/if_bonding.h
> @@ -84,6 +84,9 @@
> #define BOND_DEFAULT_MAX_BONDS 1 /* Default maximum number of devices to support */
>
> #define BOND_DEFAULT_TX_QUEUES 16 /* Default number of tx queues per device */
> +
> +#define BOND_DEFAULT_RESEND_IGMP 1 /* Default number of IGMP membership reports
> + to resend on link failure. */
> /* hashing types */
> #define BOND_XMIT_POLICY_LAYER2 0 /* layer 2 (MAC only), default */
> #define BOND_XMIT_POLICY_LAYER34 1 /* layer 3+4 (IP ^ (TCP || UDP)) */
--
Flavio