[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1303927934.2875.82.camel@bwh-desktop>
Date: Wed, 27 Apr 2011 19:12:14 +0100
From: Ben Hutchings <bhutchings@...arflare.com>
To: Alexander Duyck <alexander.h.duyck@...el.com>
Cc: davem@...emloft.net, jeffrey.t.kirsher@...el.com,
netdev@...r.kernel.org
Subject: Re: [ethtool PATCH 5/6] v4 Add RX packet classification interface
On Thu, 2011-04-21 at 13:40 -0700, Alexander Duyck wrote:
[...]
> diff --git a/ethtool.c b/ethtool.c
> index 15af86a..421fe20 100644
> --- a/ethtool.c
> +++ b/ethtool.c
[...]
> @@ -926,16 +862,45 @@ static void parse_cmdline(int argc, char **argp)
> i = argc;
> break;
> }
> - if (mode == MODE_SNTUPLE) {
> + if (mode == MODE_SCLSRULE) {
> if (!strcmp(argp[i], "flow-type")) {
> i += 1;
> if (i >= argc) {
> exit_bad_args();
> break;
> }
> - parse_rxntupleopts(argc, argp, i);
> - i = argc;
> - break;
> + if (rxclass_parse_ruleopts(&argp[i],
> + argc - i,
> + &rx_rule_fs) < 0) {
> + exit_bad_args();
> + } else {
> + i = argc;
> + rx_class_rule_added = 1;
> + }
> + } else if (!strcmp(argp[i], "class-rule-del")) {
A shorter keyword like 'delete' would do, since the -U option is only
used for flow classification.
> + i += 1;
> + if (i >= argc) {
> + exit_bad_args();
> + break;
> + }
> + rx_class_rule_del =
> + get_uint_range(argp[i], 0,
> + INT_MAX);
> + } else {
> + exit_bad_args();
> + }
> + break;
> + }
> + if (mode == MODE_GCLSRULE) {
> + if (!strcmp(argp[i], "class-rule")) {
This keyword seems redundant.
[...]
> @@ -3163,21 +3068,130 @@ static int do_permaddr(int fd, struct ifreq *ifr)
> return err;
> }
>
> +static int flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp,
> + struct ethtool_rx_ntuple_flow_spec *ntuple)
> +{
> + int i;
Should be size_t, since it's compared with a sizeof() expression.
> + /* verify location is not specified */
> + if (fsp->location != RX_CLS_LOC_UNSPEC)
> + return -1;
> +
> + /* verify ring cookie can transfer to action */
> + if (fsp->ring_cookie > INT_MAX &&
> + ~fsp->ring_cookie > 1)
> + return -1;
The second part of this condition would be more clearly expressed as:
fsp->ring_cookie < (u64)(-2)
> + /* verify only one field is setting data field */
> + if ((fsp->flow_type & FLOW_EXT) &&
> + (fsp->m_ext.data[0] || fsp->m_ext.data[1]) &&
> + fsp->m_ext.vlan_etype)
> + return -1;
> +
> + /* initialize entire ntuple to all 0xFF */
> + memset(ntuple, ~0, sizeof(*ntuple));
The comment needs to explain *why* the value is ~0 rather than 0. I
assume the idea is to set the masks to ~0 if they are not initialised
below.
> + /* set non-filter values */
> + ntuple->flow_type = fsp->flow_type;
> + ntuple->action = fsp->ring_cookie;
> +
> + /* copy header portion over */
> + memcpy(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u));
This deserves a comment that the two h_u fields are different unions,
but are defined identically except for padding at the end.
> + /* copy mask portion over and invert */
> + memcpy(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u));
> + for (i = 0; i < sizeof(fsp->m_u); i++)
> + ntuple->m_u.hdata[i] ^= 0xFF;
> +
> + /* copy extended fields */
> + if (fsp->flow_type & FLOW_EXT) {
> + ntuple->vlan_tag =
> + ntohs(fsp->h_ext.vlan_tci);
> + ntuple->vlan_tag_mask =
> + ~ntohs(fsp->m_ext.vlan_tci);
> + if (fsp->m_ext.vlan_etype) {
> + ntuple->data =
> + ntohl(fsp->h_ext.vlan_etype);
> + ntuple->data_mask =
> + ~(u64)ntohl(fsp->m_ext.vlan_etype);
> + } else {
> + ntuple->data =
> + (u64)ntohl(fsp->h_ext.data[0]);
> + ntuple->data |=
> + (u64)ntohl(fsp->h_ext.data[1]) << 32;
> + ntuple->data_mask =
> + (u64)ntohl(~fsp->m_ext.data[0]);
> + ntuple->data_mask |=
> + (u64)ntohl(~fsp->m_ext.data[1]) << 32;
> + }
> + }
I think it would be clearer to add:
else {
ntuple->vlan_tag_mask = ~(u16)0;
ntuple->data_mask = ~(u64)0;
}
rather than use memset() above.
> + return 0;
> +}
> +
> static int do_srxntuple(int fd, struct ifreq *ifr)
> {
> + struct ethtool_rx_ntuple ntuplecmd;
> + struct ethtool_value eval;
> int err;
>
> - if (sntuple_changed) {
> - struct ethtool_rx_ntuple ntuplecmd;
> + /* verify if Ntuple is supported on the HW */
This comment is inaccurate.
> + err = flow_spec_to_ntuple(&rx_rule_fs, &ntuplecmd.fs);
> + if (err)
> + return -1;
> +
> + /*
> + * Check to see if the flag is set for N-tuple, this allows
> + * us to avoid the possible EINVAL response for the N-tuple
> + * flag not being set on the device
> + */
> + eval.cmd = ETHTOOL_GFLAGS;
> + ifr->ifr_data = (caddr_t)&eval;
> + err = ioctl(fd, SIOCETHTOOL, ifr);
> + if (err || !(eval.data & ETH_FLAG_NTUPLE))
> + return -1;
>
> - ntuplecmd.cmd = ETHTOOL_SRXNTUPLE;
> - memcpy(&ntuplecmd.fs, &ntuple_fs,
> - sizeof(struct ethtool_rx_ntuple_flow_spec));
> + /* send rule via N-tuple */
> + ntuplecmd.cmd = ETHTOOL_SRXNTUPLE;
> + ifr->ifr_data = (caddr_t)&ntuplecmd;
> + err = ioctl(fd, SIOCETHTOOL, ifr);
>
> - ifr->ifr_data = (caddr_t)&ntuplecmd;
> - err = ioctl(fd, SIOCETHTOOL, ifr);
> - if (err < 0)
> - perror("Cannot add new RX n-tuple filter");
> + /*
> + * Display error only if reponse is something other than op not
> + * supported. It is possible that the interface uses the network
> + * flow classifier interface instead of N-tuple.
> + */
> + if (err && errno != EOPNOTSUPP)
> + perror("Cannot add new rule via N-tuple");
> +
> + return err;
> +}
> +
> +static int do_srxclsrule(int fd, struct ifreq *ifr)
> +{
> + int err;
> +
> + if (rx_class_rule_added) {
> + /* attempt to add rule via N-tuple specifier */
> + err = do_srxntuple(fd, ifr);
> + if (!err)
> + return 0;
> +
> + /* attempt to add rule via network flow classifier */
> + err = rxclass_rule_ins(fd, ifr, &rx_rule_fs);
> + if (err < 0) {
> + fprintf(stderr, "Cannot insert"
> + " classification rule\n");
> + return 1;
> + }
Is this the right order to try them? I'm not sure.
> + } else if (rx_class_rule_del >= 0) {
> + err = rxclass_rule_del(fd, ifr, rx_class_rule_del);
> +
> + if (err < 0) {
> + fprintf(stderr, "Cannot delete"
> + " classification rule\n");
> + return 1;
> + }
> } else {
> exit_bad_args();
> }
[...]
> diff --git a/rxclass.c b/rxclass.c
> new file mode 100644
> index 0000000..5ad0639
> --- /dev/null
> +++ b/rxclass.c
> @@ -0,0 +1,1106 @@
> +/*
> + * Copyright (C) 2008 Sun Microsystems, Inc. All rights reserved.
> + */
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <stddef.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <linux/sockios.h>
> +#include <arpa/inet.h>
> +#include "ethtool-util.h"
> +#include "ethtool-bitops.h"
> +
> +/*
> + * This is a rule manager implementation for ordering rx flow
> + * classification rules in a longest prefix first match order.
> + * The assumption is that this rule manager is the only one adding rules to
> + * the device's hardware classifier.
> + */
> +
> +struct rmgr_ctrl {
> + /* slot contains a bitmap indicating which filters are valid */
> + unsigned long *slot;
> + __u32 n_rules;
> + __u32 size;
> +};
> +
> +static struct rmgr_ctrl rmgr;
> +static int rmgr_init_done = 0;
> +
> +static void invert_flow_mask(struct ethtool_rx_flow_spec *fsp)
> +{
> + int i;
Should be size_t, since it's compared with a sizeof() expression.
> + for (i = 0; i < sizeof(fsp->m_u); i++)
> + fsp->m_u.hdata[i] ^= 0xFF;
> +}
> +
> +static void rmgr_print_nfc_spec_ext(struct ethtool_rx_flow_spec *fsp)
> +{
> + u64 data, datam;
> + __u16 etype, etypem, tci, tcim;
> +
> + if (!(fsp->flow_type & FLOW_EXT))
> + return;
> +
> + etype = ntohs(fsp->h_ext.vlan_etype);
> + etypem = ntohs(~fsp->m_ext.vlan_etype);
> + tci = ntohs(fsp->h_ext.vlan_tci);
> + tcim = ntohs(~fsp->m_ext.vlan_tci);
> + data = (u64)ntohl(fsp->h_ext.data[0]) << 32;
> + data = (u64)ntohl(fsp->h_ext.data[1]);
> + datam = (u64)ntohl(~fsp->m_ext.data[0]) << 32;
> + datam |= (u64)ntohl(~fsp->m_ext.data[1]);
> +
> + fprintf(stdout,
> + "\tVLAN EtherType: 0x%x mask: 0x%x\n"
Lower-case 'ethertype', please.
> + "\tVLAN: 0x%x mask: 0x%x\n"
> + "\tUser-defined: 0x%Lx mask: 0x%Lx\n",
'L' is not documented as an integer modifier for printf(). Use 'll'
instead.
> + etype, etypem, tci, tcim, data, datam);
> +}
> +
> +static void rmgr_print_nfc_rule(struct ethtool_rx_flow_spec *fsp)
> +{
> + unsigned char *smac, *smacm, *dmac, *dmacm;
> + __u32 sip, dip, sipm, dipm, flow_type;
> + __u16 proto, protom;
> +
> + if (fsp->location != RX_CLS_LOC_UNSPEC)
> + fprintf(stdout, "Filter: %d\n", fsp->location);
> + else
> + fprintf(stdout, "Filter: Unspecified\n");
> +
> + flow_type = fsp->flow_type & ~FLOW_EXT;
> +
> + invert_flow_mask(fsp);
> +
> + switch (flow_type) {
> + case TCP_V4_FLOW:
> + case UDP_V4_FLOW:
> + case SCTP_V4_FLOW:
> + case AH_V4_FLOW:
> + case ESP_V4_FLOW:
> + case IP_USER_FLOW:
> + sip = ntohl(fsp->h_u.tcp_ip4_spec.ip4src);
> + dip = ntohl(fsp->h_u.tcp_ip4_spec.ip4dst);
> + sipm = ntohl(fsp->m_u.tcp_ip4_spec.ip4src);
> + dipm = ntohl(fsp->m_u.tcp_ip4_spec.ip4dst);
I think this will work for AH_V4, ESP_V4 and IP_USER due to similar
structure layout, but it's kind of dodgy. Please handle each structure
type separately.
[...]
> +static int rmgr_del(__u32 loc)
> +{
> + /* verify rule exists before attempting to delete */
> + int err = rmgr_find(loc);
> + if (err)
> + return err;
> +
> + /* clear bit for the rule */
> + clear_bit(loc, rmgr.slot);
> +
> + return 0;
> +}
> +
> +static int rmgr_add(struct ethtool_rx_flow_spec *fsp)
> +{
> + __u32 loc = fsp->location;
> +
> + /* location provided, insert rule and update regions to match rule */
> + if (loc != RX_CLS_LOC_UNSPEC)
> + return rmgr_ins(loc);
> +
> + /* start at the end of the list since it is lowest priority */
> + loc = rmgr.size - 1;
> +
> + /* only part of last word is set so fill in remaining bits and test */
> + if (!~(rmgr.slot[loc / BITS_PER_LONG] |
> + (~1UL << (loc % BITS_PER_LONG))))
> + loc -= 1 + (loc % BITS_PER_LONG);
I think this is meant to avoid the need to search for bits in a partial
word but it's a very non-obvious way to do that.
> + /* find an open slot */
> + while (loc != RX_CLS_LOC_UNSPEC && !~rmgr.slot[loc / BITS_PER_LONG])
> + loc -= BITS_PER_LONG;
Just because RX_CLS_LOC_UNSPEC happens to be equal to -1 doesn't mean
it's a good idea to use the name here!
> + /* find and use available location in slot */
> + while (loc != RX_CLS_LOC_UNSPEC && test_bit(loc, rmgr.slot))
> + loc--;
> +
> + /* location found, insert rule */
> + if (loc != RX_CLS_LOC_UNSPEC) {
> + fsp->location = loc;
> + return rmgr_ins(loc);
> + }
I think it would be clearer to use a separate word index:
__u32 loc;
int i;
/* location provided, insert rule and update regions to match rule */
if (fs->location != RX_CLS_LOC_UNSPEC)
return rmgr_ins(fs->location);
/* start at the end of the list since it is lowest priority */
for (i = rmgr.size / BITS_PER_LONG - 1; i >= 0; i--) {
if (~rmgr.slot[i]) {
loc = i * BITS_PER_LONG - 1;
while (test_bit(loc, rmgr.slot))
loc--;
/* location found, insert rule */
fsp->location = loc;
return rmgr_ins(loc);
}
}
[...]
> +int rxclass_rule_getall(int fd, struct ifreq *ifr)
> +{
> + struct ethtool_rxnfc nfccmd;
> + int err, i, j;
> +
> + /* init table of available rules */
> + err = rmgr_init(fd, ifr);
> + if (err < 0)
> + return err;
> +
> + fprintf(stdout, "Total %d rules\n\n", rmgr.n_rules);
> +
> + /* fetch and display all available rules */
> + for (i = 0; i < rmgr.size; i += BITS_PER_LONG) {
> + if (!~rmgr.slot[i / BITS_PER_LONG])
> + continue;
The '~' is wrong. We want to skip words where all bits are clear, not
where all bits are set.
[...]
> +int rxclass_rule_get(int fd, struct ifreq *ifr, __u32 loc)
> +{
> + struct ethtool_rxnfc nfccmd;
> + int err;
> +
> + /* init table of available rules */
> + err = rmgr_init(fd, ifr);
> + if (err < 0)
> + return err;
> +
> + /* verify rule exists before attempting to display */
> + err = rmgr_find(loc);
> + if (err < 0)
> + return err;
Is this really necessary? Shouldn't we let the driver check that for
us, and save the cost of initialising the rule manager?
[...]
> +int rxclass_rule_ins(int fd, struct ifreq *ifr,
> + struct ethtool_rx_flow_spec *fsp)
> +{
> + struct ethtool_rxnfc nfccmd;
> + int err;
> +
> + /* init table of available rules */
> + err = rmgr_init(fd, ifr);
> + if (err < 0)
> + return err;
> +
> + /* verify rule location */
> + err = rmgr_add(fsp);
> + if (err < 0)
> + return err;
> +
> + /* notify netdev of new rule */
> + nfccmd.cmd = ETHTOOL_SRXCLSRLINS;
> + nfccmd.fs = *fsp;
> + ifr->ifr_data = (caddr_t)&nfccmd;
> + err = ioctl(fd, SIOCETHTOOL, ifr);
> + if (err < 0) {
> + perror("rmgr: Cannot insert RX class rule");
> + return -1;
> + }
> + rmgr.n_rules++;
If we're about to destroy the rule manager immediately afterwards, why
bother maintaining n_rules?
> + printf("Added rule with ID %d\n", fsp->location);
> +
> + rmgr_cleanup();
> +
> + return 0;
> +}
> +
> +int rxclass_rule_del(int fd, struct ifreq *ifr, __u32 loc)
> +{
> + struct ethtool_rxnfc nfccmd;
> + int err;
> +
> + /* init table of available rules */
> + err = rmgr_init(fd, ifr);
> + if (err < 0)
> + return err;
> +
> + /* verify rule exists */
> + err = rmgr_del(loc);
> + if (err < 0)
> + return err;
Again, I think that can be left to the driver.
[...]
> +typedef enum {
> + OPT_NONE = 0,
> + OPT_S32,
> + OPT_U8,
> + OPT_U16,
> + OPT_U32,
> + OPT_U64,
> + OPT_BE16,
> + OPT_BE32,
> + OPT_BE64,
> + OPT_IP4,
> + OPT_MAC,
> +} rule_opt_type_t;
> +
> +#define NFC_FLAG_RING 0x001
> +#define NFC_FLAG_LOC 0x002
> +#define NFC_FLAG_SADDR 0x004
> +#define NFC_FLAG_DADDR 0x008
> +#define NFC_FLAG_SPORT 0x010
> +#define NFC_FLAG_DPORT 0x020
> +#define NFC_FLAG_SPI 0x030
> +#define NFC_FLAG_TOS 0x040
> +#define NFC_FLAG_PROTO 0x080
> +#define NTUPLE_FLAG_VLAN 0x100
> +#define NTUPLE_FLAG_UDEF 0x200
> +#define NTUPLE_FLAG_VETH 0x400
> +
> +struct rule_opts {
> + const char *name;
> + rule_opt_type_t type;
> + u32 flag;
> + int offset;
> + int moffset;
> +};
I think that this ought to be merged with the argument parsing in
ethtool.c, but I won't insist that you do that immedaitely. However:
[...]
> +static int rxclass_get_ipv4(char *str, __be32 *val)
> +{
> + if (!strchr(str, '.')) {
> + unsigned long long v;
> + int err;
> +
> + err = rxclass_get_ulong(str, &v, 32);
> + if (err)
> + return -1;
> +
> + *val = htonl((u32)v);
> +
> + return 0;
> + }
> +
> + if (!inet_pton(AF_INET, str, val))
> + return -1;
There's no need to make a special case for integers. inet_aton() should
handle all the historically supported IPv4 literal formats.
[...]
> +int rxclass_parse_ruleopts(char **argp, int argc,
> + struct ethtool_rx_flow_spec *fsp)
> +{
> + const struct rule_opts *options;
> + unsigned char *p = (unsigned char *)fsp;
> + int i = 0, n_opts, err;
> + u32 flags = 0;
> + int flow_type;
> +
> + if (*argp == NULL || **argp == '\0' || argc < 1)
> + goto syntax_err;
argc < 1 is sufficient.
> + if (!strcmp(argp[0], "tcp4"))
> + flow_type = TCP_V4_FLOW;
> + else if (!strcmp(argp[0], "udp4"))
> + flow_type = UDP_V4_FLOW;
> + else if (!strcmp(argp[0], "sctp4"))
> + flow_type = SCTP_V4_FLOW;
> + else if (!strcmp(argp[0], "ah4"))
> + flow_type = AH_V4_FLOW;
> + else if (!strcmp(argp[0], "esp4"))
> + flow_type = ESP_V4_FLOW;
> + else if (!strcmp(argp[0], "ip4"))
> + flow_type = IP_USER_FLOW;
> + else if (!strcmp(argp[0], "ether"))
> + flow_type = ETHER_FLOW;
> + else
> + goto syntax_err;
> +
> + switch (flow_type) {
> + case TCP_V4_FLOW:
> + case UDP_V4_FLOW:
> + case SCTP_V4_FLOW:
> + options = rule_nfc_tcp_ip4;
> + n_opts = ARRAY_SIZE(rule_nfc_tcp_ip4);
> + break;
> + case AH_V4_FLOW:
> + case ESP_V4_FLOW:
> + options = rule_nfc_esp_ip4;
> + n_opts = ARRAY_SIZE(rule_nfc_esp_ip4);
> + break;
> + case IP_USER_FLOW:
> + options = rule_nfc_usr_ip4;
> + n_opts = ARRAY_SIZE(rule_nfc_usr_ip4);
> + break;
> + case ETHER_FLOW:
> + options = rule_nfc_ether;
> + n_opts = ARRAY_SIZE(rule_nfc_ether);
> + break;
> + default:
> + fprintf(stdout, "Add rule, invalid rule type[%s]\n", argp[0]);
stderr, not stdout
[...]
> + if (idx == n_opts) {
> + fprintf(stdout, "Add rule, unreconized option[%s]\n",
Typo: 'unreconized' should be 'unrecognized'.
> + argp[i]);
> + return -1;
> + }
> + }
> +
> + if (flags & (NTUPLE_FLAG_VLAN | NTUPLE_FLAG_UDEF | NTUPLE_FLAG_VETH))
> + fsp->flow_type |= FLOW_EXT;
> +
> + return 0;
> +
> +syntax_err:
> + fprintf(stdout, "Add rule, invalid syntax\n");
stderr
> + return -1;
> +}
>
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists