[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CA+mtBx-Oa_Mb0YLu5W8sPcXqh80cD7rh5KvH0umzPv8YtbWEMQ@mail.gmail.com>
Date: Wed, 10 Jul 2013 14:18:28 -0700
From: Tom Herbert <therbert@...gle.com>
To: John Fastabend <john.fastabend@...il.com>
Cc: Stephen Hemminger <stephen@...workplumber.org>,
ben@...adent.org.uk,
"Brandeburg, Jesse" <jesse.brandeburg@...el.com>,
Linux Netdev List <netdev@...r.kernel.org>,
Jeff Kirsher <jeffrey.t.kirsher@...el.com>
Subject: Re: [RFC PATCH] net: add a rate_limit attribute to netdev_queue and a rtnetlink
John, thanks for posting this.
Should we have separate guaranteed and maximum rate? Will this be
usable in the case that more than one queue shares a rate? Do you
think other parameters, like priority weights should be done the same
way?
Tom
On Wed, Jul 10, 2013 at 7:04 AM, John Fastabend
<john.fastabend@...il.com> wrote:
> Add a rate_limit attribute to netdev_queue and a corresponding
> ndo ops to pass the parameter to the driver. Also provide an
> rtnetlink attribute to dump and set arbitrary queue attributes
> the first of which is the rate_limit.
>
> Netlink attribute layout shown here,
>
> /* Queue Attributes management
> * Nested layout of queue attributes is:
> * [IFLA_QUEUE_ATTRIBS]
> * [IFLA_QUEUE_ATTRIB]
> * [IFLA_QUEUE_INDEX] <required>
> * [IFLA_QUEUE_RATE] <optional>
> * [IFLA_QUEUE_ATTRIB]
> * [...]
> */
>
> Adding this as a queue attribute and _not_ a qdisc option allows
> using rate limits with qdisc schemes that may not align with tx rings
> and also allows using QOS schemes along with rate limits.
>
> A sample implementation is provided for ixgbe. Any improvements or
> suggestions are welcome. I would also be interested to know if this works
> with other hardware and if Mbps is a good default unit.
>
> The ndo op is only called when the netlink API is invoked, so drivers
> may need to adjust internal registers on link events (speed changes)
> as needed. More attributes can be added as needed.
>
> A test patch for iproute2 will be provided.
>
> Signed-off-by: John Fastabend <john.r.fastabend@...el.com>
> ---
> drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 16 ++++
> drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 47 +++++++++++--
> drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h | 2 +
> include/linux/netdevice.h | 14 ++++
> include/uapi/linux/if_link.h | 24 ++++++
> net/core/rtnetlink.c | 90 ++++++++++++++++++++++++
> 6 files changed, 187 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 047ebaa..6a9b99c 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -5579,6 +5579,19 @@ static void ixgbe_check_hang_subtask(struct ixgbe_adapter *adapter)
>
> }
>
> +static void ixgbe_reset_rate_limits(struct ixgbe_adapter *adapter)
> +{
> + struct net_device *dev = adapter->netdev;
> + int i;
> +
> + for (i = 0; i < dev->real_num_tx_queues; i++) {
> + struct netdev_queue *q = netdev_get_tx_queue(dev, i);
> + u32 rate = q->rate_limit;
> +
> + ixgbe_set_rate_limit(adapter->netdev, i, &rate);
> + }
> +}
> +
> /**
> * ixgbe_watchdog_update_link - update the link status
> * @adapter: pointer to the device adapter structure
> @@ -5620,6 +5633,8 @@ static void ixgbe_watchdog_update_link(struct ixgbe_adapter *adapter)
>
> adapter->link_up = link_up;
> adapter->link_speed = link_speed;
> +
> + ixgbe_reset_rate_limits(adapter);
> }
>
> static void ixgbe_update_default_up(struct ixgbe_adapter *adapter)
> @@ -7244,6 +7259,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
> .ndo_fdb_add = ixgbe_ndo_fdb_add,
> .ndo_bridge_setlink = ixgbe_ndo_bridge_setlink,
> .ndo_bridge_getlink = ixgbe_ndo_bridge_getlink,
> + .ndo_set_ratelimit = ixgbe_set_rate_limit,
> };
>
> /**
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
> index 1e7d587..1ea78f1 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
> @@ -1107,17 +1107,14 @@ static int ixgbe_link_mbps(struct ixgbe_adapter *adapter)
> }
> }
>
> -static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
> +static u32 ixgbe_bcnrc_from_rate(struct ixgbe_adapter *adapter,
> + u16 tx_rate, int link_speed)
> {
> - struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
> - struct ixgbe_hw *hw = &adapter->hw;
> u32 bcnrc_val = 0;
> - u16 queue, queues_per_pool;
> - u16 tx_rate = adapter->vfinfo[vf].tx_rate;
>
> if (tx_rate) {
> /* start with base link speed value */
> - bcnrc_val = adapter->vf_rate_link_speed;
> + bcnrc_val = link_speed;
>
> /* Calculate the rate factor values to set */
> bcnrc_val <<= IXGBE_RTTBCNRC_RF_INT_SHIFT;
> @@ -1131,6 +1128,11 @@ static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
> bcnrc_val |= IXGBE_RTTBCNRC_RS_ENA;
> }
>
> + return bcnrc_val;
> +}
> +
> +static void ixgbe_set_xmit_compensation(struct ixgbe_hw *hw)
> +{
> /*
> * Set global transmit compensation time to the MMW_SIZE in RTTBCNRM
> * register. Typically MMW_SIZE=0x014 if 9728-byte jumbo is supported
> @@ -1146,6 +1148,39 @@ static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
> default:
> break;
> }
> +}
> +
> +int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate)
> +{
> + struct ixgbe_adapter *a = netdev_priv(dev);
> + struct ixgbe_hw *hw = &a->hw;
> + int linkspeed = ixgbe_link_mbps(a);
> + u8 reg_idx = a->tx_ring[index]->reg_idx;
> + u32 bcnrc = ixgbe_bcnrc_from_rate(a, *tx_rate, linkspeed);
> +
> + /* rate limit cannot be less than 10Mbps */
> + if (*tx_rate && (*tx_rate <= 10))
> + return -EINVAL;
> +
> + ixgbe_set_xmit_compensation(hw);
> +
> + IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, reg_idx);
> + IXGBE_WRITE_REG(hw, IXGBE_RTTBCNRC, bcnrc);
> +
> + return 0;
> +}
> +
> +static void ixgbe_set_vf_rate_limit(struct ixgbe_adapter *adapter, int vf)
> +{
> + struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ];
> + struct ixgbe_hw *hw = &adapter->hw;
> + u32 bcnrc_val = 0;
> + u16 queue, queues_per_pool;
> + u16 tx_rate = adapter->vfinfo[vf].tx_rate;
> +
> + bcnrc_val = ixgbe_bcnrc_from_rate(adapter, tx_rate,
> + adapter->vf_rate_link_speed);
> + ixgbe_set_xmit_compensation(hw);
>
> /* determine how many queues per pool based on VMDq mask */
> queues_per_pool = __ALIGN_MASK(1, ~vmdq->mask);
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
> index 4713f9f..d8b4bbe 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
> @@ -56,5 +56,7 @@ static inline void ixgbe_set_vmvir(struct ixgbe_adapter *adapter,
> IXGBE_WRITE_REG(hw, IXGBE_VMVIR(vf), vmvir);
> }
>
> +int ixgbe_set_rate_limit(struct net_device *dev, int index, u32 *tx_rate);
> +
> #endif /* _IXGBE_SRIOV_H_ */
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 09b4188..0e67b09 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -574,6 +574,7 @@ struct netdev_queue {
> #ifdef CONFIG_BQL
> struct dql dql;
> #endif
> + u32 rate_limit;
> } ____cacheline_aligned_in_smp;
>
> static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
> @@ -932,6 +933,16 @@ struct netdev_fcoe_hbainfo {
> * that determine carrier state from physical hardware properties (eg
> * network cables) or protocol-dependent mechanisms (eg
> * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
> + *
> + * int (*ndo_set_ratelimit)(struct net_device *dev,
> + * int queue_index, u32 *maxrate)
> + * Called to set the rate limit in Mbps of the specified queue_index
> + * setting the limit to maxrate. It is expected that hardware may
> + * quantize the rate limits. In these cases the driver should guarantee the
> + * specified maxrate is not exceeded and return the set value in maxrate.
> + * Zero should be returned on success otherwise use appropriate error code.
> + * Drivers must handle any updates required with link speed changes within
> + * normal driver code paths.
> */
> struct net_device_ops {
> int (*ndo_init)(struct net_device *dev);
> @@ -1060,6 +1071,9 @@ struct net_device_ops {
> struct nlmsghdr *nlh);
> int (*ndo_change_carrier)(struct net_device *dev,
> bool new_carrier);
> + int (*ndo_set_ratelimit)(struct net_device *dev,
> + int queue_index,
> + u32 *max_rate);
> };
>
> /*
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index 03f6170..67a353c 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -143,6 +143,7 @@ enum {
> IFLA_NUM_TX_QUEUES,
> IFLA_NUM_RX_QUEUES,
> IFLA_CARRIER,
> + IFLA_QUEUE_ATTRIBS,
> __IFLA_MAX
> };
>
> @@ -449,6 +450,29 @@ struct ifla_port_vsi {
> __u8 pad[3];
> };
>
> +/* Queue Attributes management
> + * Nested layout of queue attributes is:
> + * [IFLA_QUEUE_ATTRIBS]
> + * [IFLA_QUEUE_ATTRIB]
> + * [IFLA_QUEUE_INDEX] <required>
> + * [IFLA_QUEUE_RATE] <optional>
> + * [IFLA_QUEUE_ATTRIB]
> + * [...]
> + */
> +enum {
> + IFLA_QUEUE_ATTRIB_UNSPEC,
> + IFLA_QUEUE_ATTRIB, /* nest */
> + __IFLA_QUEUE_ATTRIB_MAX,
> +};
> +#define IFLA_QUEUE_ATTRIB_MAX (__IFLA_QUEUE_ATTRIB_MAX - 1)
> +
> +enum {
> + IFLA_QUEUE_UNSPEC,
> + IFLA_QUEUE_INDEX, /* __u32 */
> + IFLA_QUEUE_RATE, /* __u32 */
> + __IFLA_QUEUE_MAX,
> +};
> +#define IFLA_QUEUE_MAX (__IFLA_QUEUE_MAX - 1)
>
> /* IPoIB section */
>
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index 9007533..d7ff1c2 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -846,6 +846,45 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
> return 0;
> }
>
> +static int rtnl_queue_fill(struct sk_buff *skb, struct net_device *dev)
> +{
> + int i;
> + struct nlattr *attribs, *attrib = NULL;
> +
> + attribs = nla_nest_start(skb, IFLA_QUEUE_ATTRIBS);
> + if (!attribs)
> + return -EMSGSIZE;
> +
> + for (i = 0; i < dev->real_num_tx_queues; i++) {
> + struct netdev_queue *q = netdev_get_tx_queue(dev, i);
> +
> + if (!q->rate_limit)
> + continue;
> +
> + attrib = nla_nest_start(skb, IFLA_QUEUE_ATTRIB);
> + if (!attrib) {
> + nla_nest_cancel(skb, attribs);
> + return -EMSGSIZE;
> + }
> +
> + if (nla_put_u32(skb, IFLA_QUEUE_INDEX, i) ||
> + nla_put_u32(skb, IFLA_QUEUE_RATE, q->rate_limit)) {
> + nla_nest_cancel(skb, attrib);
> + nla_nest_cancel(skb, attribs);
> + return -EMSGSIZE;
> + }
> +
> + nla_nest_end(skb, attrib);
> + }
> +
> + if (attrib)
> + nla_nest_end(skb, attribs);
> + else
> + nla_nest_cancel(skb, attribs);
> +
> + return 0;
> +}
> +
> static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
> int type, u32 pid, u32 seq, u32 change,
> unsigned int flags, u32 ext_filter_mask)
> @@ -997,6 +1036,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
> if (rtnl_port_fill(skb, dev))
> goto nla_put_failure;
>
> + if (rtnl_queue_fill(skb, dev))
> + goto nla_put_failure;
> +
> if (dev->rtnl_link_ops) {
> if (rtnl_link_fill(skb, dev) < 0)
> goto nla_put_failure;
> @@ -1150,6 +1192,11 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
> [IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
> };
>
> +static const struct nla_policy ifla_queue_policy[IFLA_QUEUE_MAX+1] = {
> + [IFLA_QUEUE_INDEX] = { .type = NLA_U32},
> + [IFLA_QUEUE_RATE] = { .type = NLA_U32},
> +};
> +
> struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
> {
> struct net *net;
> @@ -1503,6 +1550,49 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
> goto errout;
> modified = 1;
> }
> + if (tb[IFLA_QUEUE_ATTRIBS]) {
> + struct nlattr *attr;
> + int rem;
> +
> + nla_for_each_nested(attr, tb[IFLA_QUEUE_ATTRIBS], rem) {
> + struct nlattr *queue[IFLA_QUEUE_MAX+1];
> + struct netdev_queue *q = NULL;
> + unsigned int qindex, qrate;
> +
> + if (nla_type(attr) != IFLA_QUEUE_ATTRIB)
> + continue;
> +
> + err = nla_parse_nested(queue, IFLA_QUEUE_MAX,
> + attr, ifla_queue_policy);
> + if (err < 0)
> + goto errout;
> +
> + err = -EINVAL;
> + if (queue[IFLA_QUEUE_INDEX]) {
> + qindex = nla_get_u32(queue[IFLA_QUEUE_INDEX]);
> + if (qindex >= dev->real_num_tx_queues)
> + goto errout;
> +
> + q = netdev_get_tx_queue(dev, qindex);
> + } else {
> + goto errout;
> + }
> +
> + err = -EOPNOTSUPP;
> + if (queue[IFLA_QUEUE_RATE]) {
> + if (!ops->ndo_set_ratelimit)
> + goto errout;
> +
> + qrate = nla_get_u32(queue[IFLA_QUEUE_RATE]);
> + err = ops->ndo_set_ratelimit(dev,
> + qindex, &qrate);
> + if (err < 0)
> + goto errout;
> + q->rate_limit = qrate;
> + modified = 1;
> + }
> + }
> + }
>
> if (tb[IFLA_AF_SPEC]) {
> struct nlattr *af;
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists