[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20111021150250.GB10076@minipsycho>
Date: Fri, 21 Oct 2011 17:02:51 +0200
From: Jiri Pirko <jpirko@...hat.com>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: netdev@...r.kernel.org, davem@...emloft.net,
bhutchings@...arflare.com, shemminger@...tta.com, fubar@...ibm.com,
andy@...yhouse.net, tgraf@...radead.org, ebiederm@...ssion.com,
mirqus@...il.com, kaber@...sh.net, greearb@...delatech.com,
jesse@...ira.com, fbl@...hat.com, benjamin.poirier@...il.com,
jzupka@...hat.com
Subject: Re: [patch net-next V2] net: introduce ethernet teaming device
Fri, Oct 21, 2011 at 04:43:57PM CEST, eric.dumazet@...il.com wrote:
>Le vendredi 21 octobre 2011 à 14:39 +0200, Jiri Pirko a écrit :
>> This patch introduces new network device called team. It supposes to be
>> very fast, simple, userspace-driven alternative to existing bonding
>> driver.
>>
>> Userspace library called libteam with couple of demo apps is available
>> here:
>> https://github.com/jpirko/libteam
>> Note it's still in its dipers atm.
>>
>> team<->libteam use generic netlink for communication. That and rtnl
>> suppose to be the only way to configure team device, no sysfs etc.
>>
>> Python binding basis for libteam was recently introduced (some need
>> still need to be done on it though). Daemon providing arpmon/miimon
>> active-backup functionality will be introduced shortly.
>> All what's necessary is already implemented in kernel team driver.
>>
>> Signed-off-by: Jiri Pirko <jpirko@...hat.com>
>>
>> v1->v2:
>> - modes are made as modules. Makes team more modular and
>> extendable.
>> - several commenters' nitpicks found on v1 were fixed
>> - several other bugs were fixed.
>> - note I ignored Eric's comment about roundrobin port selector
>> as Eric's way may be easily implemented as another mode (mode
>> "random") in future.
>> ---
>
>Very nice work !
>
>> +
>> +
>> +/*
>> + * We can benefit from the fact that it's ensured no port is present
>> + * at the time of mode change.
>> + */
>> +static int __team_change_mode(struct team *team,
>> + const struct team_mode *new_mode)
>> +{
>> + /* Check if mode was previously set and do cleanup if so */
>> + if (team->mode_kind) {
>> + void (*exit_op)(struct team *team) = team->mode_ops.exit;
>> +
>> + /* Clear ops area so no callback is called any longer */
>> + memset(&team->mode_ops, 0, sizeof(struct team_mode_ops));
>
> Hmm, memset() has no atomicity guarantee about 'longs' or 'pointers'.
>
> You must make sure mode_ops.receive (and other pointers) is set not
>byte per byte, but in one go.
:( What do you suggest? Set these pointers one-by-one?
>
>> +
>> + synchronize_rcu();
>> +
>> + if (exit_op)
>> + exit_op(team);
>> + team_mode_put(team->mode_kind);
>> + team->mode_kind = NULL;
>> + /* zero private data area */
>> + memset(&team->mode_priv, 0,
>> + sizeof(struct team) - offsetof(struct team, mode_priv));
>> + }
>> +
>> + if (!new_mode)
>> + return 0;
>> +
>> + if (new_mode->ops->init) {
>> + int err;
>> +
>> + err = new_mode->ops->init(team);
>> + if (err)
>> + return err;
>> + }
>> +
>> + team->mode_kind = new_mode->kind;
>> + memcpy(&team->mode_ops, new_mode->ops, sizeof(struct team_mode_ops));
>> +
>> + return 0;
>> +}
>> +
>
>> +
>> +/************************
>> + * Rx path frame handler
>> + ************************/
>> +
>> +/* note: already called with rcu_read_lock */
>> +static rx_handler_result_t team_handle_frame(struct sk_buff **pskb)
>> +{
>> + struct sk_buff *skb = *pskb;
>> + struct team_port *port;
>> + struct team *team;
>> + rx_handler_result_t res = RX_HANDLER_ANOTHER;
>> +
>> + skb = skb_share_check(skb, GFP_ATOMIC);
>> + if (!skb)
>> + return RX_HANDLER_CONSUMED;
>> +
>> + *pskb = skb;
>> +
>> + port = team_port_get_rcu(skb->dev);
>> + team = port->team;
>> +
>> + if (team->mode_ops.receive)
>
>Hmm, you need ACCESS_ONCE() here or rcu_dereference()
>
>See commit 4d97480b1806e883eb (bonding: use local function pointer of
>bond->recv_probe in bond_handle_frame) for reference
Will do
>
>> + res = team->mode_ops.receive(team, port, skb);
>> +
>> + if (res == RX_HANDLER_ANOTHER) {
>> + struct team_pcpu_stats *pcpu_stats;
>> +
>> + pcpu_stats = this_cpu_ptr(team->pcpu_stats);
>> + u64_stats_update_begin(&pcpu_stats->syncp);
>> + pcpu_stats->rx_packets++;
>> + pcpu_stats->rx_bytes += skb->len;
>> + if (skb->pkt_type == PACKET_MULTICAST)
>> + pcpu_stats->rx_multicast++;
>> + u64_stats_update_end(&pcpu_stats->syncp);
>> +
>> + skb->dev = team->dev;
>> + } else {
>> + this_cpu_inc(team->pcpu_stats->rx_dropped);
>> + }
>> +
>> + return res;
>> +}
>> +
>
>
>> +
>> +static int team_port_enter(struct team *team, struct team_port *port)
>> +{
>> + int err = 0;
>> +
>> + dev_hold(team->dev);
>> + port->dev->priv_flags |= IFF_TEAM_PORT;
>> + if (team->mode_ops.port_enter) {
>> + err = team->mode_ops.port_enter(team, port);
>> + if (err)
>> + netdev_err(team->dev, "Device %s failed to enter team mode\n",
>> + port->dev->name);
>
> Not sure if you need to unset IFF_TEAM_PORT;
I do. I will add it.
>
>> + }
>> + return err;
>> +}
>> +
>
>
>> diff --git a/include/linux/if_team.h b/include/linux/if_team.h
>> new file mode 100644
>> index 0000000..21581a7
>> --- /dev/null
>> +++ b/include/linux/if_team.h
>> @@ -0,0 +1,233 @@
>> +/*
>> + * include/linux/if_team.h - Network team device driver header
>> + * Copyright (c) 2011 Jiri Pirko <jpirko@...hat.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + */
>> +
>> +#ifndef _LINUX_IF_TEAM_H_
>> +#define _LINUX_IF_TEAM_H_
>> +
>> +#ifdef __KERNEL__
>> +
>> +struct team_pcpu_stats {
>> + u64 rx_packets;
>> + u64 rx_bytes;
>> + u64 rx_multicast;
>> + u64 tx_packets;
>> + u64 tx_bytes;
>> + struct u64_stats_sync syncp;
>> + u32 rx_dropped;
>> + u32 tx_dropped;
>
> "unsigned long" for these two fields ?
I copied these from some other driver (macvlan I think). I will change
this to unsigned long.
>
>> +};
>> +
>> +struct team;
>> +
>> +struct team_port {
>> + struct net_device *dev;
>> + struct hlist_node hlist; /* node in hash list */
>> + struct list_head list; /* node in ordinary list */
>> + struct team *team;
>> + int index;
>> +
>> + /*
>> + * A place for storing original values of the device before it
>> + * become a port.
>> + */
>> + struct {
>> + unsigned char dev_addr[MAX_ADDR_LEN];
>> + unsigned int mtu;
>> + } orig;
>> +
>> + bool linkup;
>> + u32 speed;
>> + u8 duplex;
>> +
>> + struct rcu_head rcu;
>> +};
>> +
>> +struct team_mode_ops {
>> + int (*init)(struct team *team);
>> + void (*exit)(struct team *team);
>> + rx_handler_result_t (*receive)(struct team *team,
>> + struct team_port *port,
>> + struct sk_buff *skb);
>> + bool (*transmit)(struct team *team, struct sk_buff *skb);
>> + int (*port_enter)(struct team *team, struct team_port *port);
>> + void (*port_leave)(struct team *team, struct team_port *port);
>> + void (*port_change_mac)(struct team *team, struct team_port *port);
>> +};
>> +
>> +enum team_option_type {
>> + TEAM_OPTION_TYPE_U32,
>> + TEAM_OPTION_TYPE_STRING,
>> +};
>> +
>> +struct team_option {
>> + struct list_head list;
>> + const char *name;
>> + enum team_option_type type;
>> + int (*getter)(struct team *team, void *arg);
>> + int (*setter)(struct team *team, void *arg);
>> +};
>> +
>> +struct team_mode {
>> + struct list_head list;
>> + const char *kind;
>> + struct module *owner;
>> + size_t priv_size;
>> + const struct team_mode_ops *ops;
>> +};
>> +
>> +#define TEAM_MODE_PRIV_LONGS 4
>> +#define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS)
>> +
>> +struct team {
>> + struct net_device *dev; /* associated netdevice */
>
>
>> + struct team_pcpu_stats __percpu *pcpu_stats;
>
> I believe you can use net_device->anonymous_union, the one with
>ml_priv :
> struct pcpu_lstats __percpu *lstats; /* loopback stats */
> struct pcpu_tstats __percpu *tstats; /* tunnel stats */
> struct pcpu_dstats __percpu *dstats; /* dummy stats */
>and add here
> struct team_pcpu_stats __percpu *team_stats;
I this the right way? I must say I do not like it too much :(
>
>> +
>> + spinlock_t lock; /* used for overall locking, e.g. port lists write */
>> +
>> + /*
>> + * port lists with port count
>> + */
>> + int port_count;
>> + struct hlist_head *port_hlist;
>
> I am not sure why you want an external hash table, with 16 pointers...
>This could be embedded here to remove one dereference ?
Good point! Will change this.
Thanks Eric!
Jirka
>
>> + struct list_head port_list;
>> +
>> + struct list_head option_list;
>> +
>> + const char *mode_kind;
>> + struct team_mode_ops mode_ops;
>> + long mode_priv[TEAM_MODE_PRIV_LONGS];
>> +};
>> +
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists