Message-ID: <1279078875.2444.103.camel@edumazet-laptop>
Date: Wed, 14 Jul 2010 05:41:15 +0200
From: Eric Dumazet <eric.dumazet@...il.com>
To: Changli Gao <xiaosuo@...il.com>
Cc: Jamal Hadi Salim <hadi@...erus.ca>,
"David S. Miller" <davem@...emloft.net>,
Patrick McHardy <kaber@...sh.net>,
Tom Herbert <therbert@...gle.com>, netdev@...r.kernel.org
Subject: Re: [PATCH RFC] act_cpu: packet distributing
On Wednesday, 14 July 2010 at 11:17 +0800, Changli Gao wrote:
> I want to know if I can assign the sk to the skb, as nf_tproxy_core does,
> to avoid the duplicate search later. Thanks.
>
I suggest we first fix bugs before adding new features.
For me, tproxy is a prime suspect at the moment; I would not use it on
a production machine.
> act_cpu: packet distributing
>
> This TC action can be used to redirect packets to the CPU:
> * specified by the cpuid option
> * specified by the class minor ID obtained previously
> * on which the corresponding application runs
>
> It supports functions similar to RPS and RFS, but is more flexible.
>
This kind of claim is disgusting.
There is no documentation; I have no idea how you set the thing up.
RPS still lacks documentation, but activating it is easy: write a CPU
bitmask to /sys/class/net/<dev>/queues/rx-<n>/rps_cpus.
> Signed-off-by: Changli Gao <xiaosuo@...il.com>
> ----
> include/linux/netdevice.h | 2
> include/linux/tc_act/tc_cpu.h | 31 +++++
> include/net/sock.h | 24 +++
> include/net/tc_act/tc_cpu.h | 18 ++
> net/core/dev.c | 4
> net/core/sock.c | 1
> net/sched/Kconfig | 12 +
> net/sched/Makefile | 1
> net/sched/act_cpu.c | 260 ++++++++++++++++++++++++++++++++++++++++++
> 9 files changed, 350 insertions(+), 3 deletions(-)
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index c4fedf0..318d422 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1435,6 +1435,8 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd,
>
> DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
>
> +int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail);
> +
> #define HAVE_NETIF_QUEUE
>
> extern void __netif_schedule(struct Qdisc *q);
> diff --git a/include/linux/tc_act/tc_cpu.h b/include/linux/tc_act/tc_cpu.h
> new file mode 100644
> index 0000000..2704607
> --- /dev/null
> +++ b/include/linux/tc_act/tc_cpu.h
> @@ -0,0 +1,31 @@
> +#ifndef __LINUX_TC_CPU_H
> +#define __LINUX_TC_CPU_H
> +
> +#include <linux/pkt_cls.h>
> +#include <linux/types.h>
> +
> +#define TCA_ACT_CPU 12
> +
> +enum {
> + TCA_CPU_UNSPEC,
> + TCA_CPU_PARMS,
> + TCA_CPU_TM,
> + __TCA_CPU_MAX
> +};
> +#define TCA_CPU_MAX (__TCA_CPU_MAX - 1)
> +
> +enum {
> + TCA_CPU_TYPE_MAP,
> + TCA_CPU_TYPE_CPUID,
> + TCA_CPU_TYPE_SOCKET,
> + __TCA_CPU_TYPE_MAX
> +};
> +#define TCA_CPU_TYPE_MAX (__TCA_CPU_TYPE_MAX - 1)
> +
> +struct tc_cpu {
> + tc_gen;
> + __u32 type;
> + __u32 value;
> +};
> +
> +#endif /* __LINUX_TC_CPU_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 3100e71..7913158 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -200,6 +200,7 @@ struct sock_common {
> * @sk_rcvtimeo: %SO_RCVTIMEO setting
> * @sk_sndtimeo: %SO_SNDTIMEO setting
> * @sk_rxhash: flow hash received from netif layer
> + * @sk_cpu: the CPU on which the corresponding process works.
> * @sk_filter: socket filtering instructions
> * @sk_protinfo: private area, net family specific, when not using slab
> * @sk_timer: sock cleanup timer
> @@ -284,6 +285,9 @@ struct sock {
> #ifdef CONFIG_RPS
> __u32 sk_rxhash;
> #endif
> +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE)
> + int sk_cpu;
> +#endif
> unsigned long sk_flags;
> unsigned long sk_lingertime;
> struct sk_buff_head sk_error_queue;
> @@ -639,7 +643,24 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> return sk->sk_backlog_rcv(sk, skb);
> }
>
> -static inline void sock_rps_record_flow(const struct sock *sk)
> +static inline void sock_reset_cpu(struct sock *sk)
> +{
> +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE)
> + sk->sk_cpu = nr_cpumask_bits;
> +#endif
> +}
> +
> +static inline void sock_save_cpu(struct sock *sk)
> +{
> +#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE)
> + int cpu = get_cpu();
> + if (sk->sk_cpu != cpu)
> + sk->sk_cpu = cpu;
> + put_cpu();
This whole dance is not needed; you could simply do

	sk->sk_cpu = raw_smp_processor_id();

as in the untested sketch after this hunk.
> +#endif
> +}
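
A minimal, untested sketch of what I mean, keeping your config guards.
sk_cpu is only a hint, so there is no need to disable preemption around
the store:

static inline void sock_save_cpu(struct sock *sk)
{
#if defined(CONFIG_NET_ACT_CPU) || defined(CONFIG_NET_ACT_CPU_MODULE)
	/* a stale value is harmless, raw_smp_processor_id() is enough */
	sk->sk_cpu = raw_smp_processor_id();
#endif
}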
> +
> +static inline void sock_rps_record_flow(struct sock *sk)
> {
> #ifdef CONFIG_RPS
> struct rps_sock_flow_table *sock_flow_table;
> @@ -649,6 +670,7 @@ static inline void sock_rps_record_flow(const struct sock *sk)
> rps_record_sock_flow(sock_flow_table, sk->sk_rxhash);
> rcu_read_unlock();
> #endif
> + sock_save_cpu(sk);
> }
>
> static inline void sock_rps_reset_flow(const struct sock *sk)
> diff --git a/include/net/tc_act/tc_cpu.h b/include/net/tc_act/tc_cpu.h
> new file mode 100644
> index 0000000..0504bf0
> --- /dev/null
> +++ b/include/net/tc_act/tc_cpu.h
> @@ -0,0 +1,18 @@
> +#ifndef __NET_TC_CPU_H
> +#define __NET_TC_CPU_H
> +
> +#include <linux/types.h>
> +#include <net/act_api.h>
> +
> +struct tcf_cpu {
> + struct tcf_common common;
> + u32 type;
> + u32 value;
> +};
> +
> +static inline struct tcf_cpu *to_tcf_cpu(struct tcf_common *pc)
> +{
> + return container_of(pc, struct tcf_cpu, common);
> +}
> +
> +#endif /* __NET_TC_CPU_H */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index e2b9fa2..45e8a21 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2443,8 +2443,7 @@ static int rps_ipi_queued(struct softnet_data *sd)
> * enqueue_to_backlog is called to queue an skb to a per CPU backlog
> * queue (may be a remote CPU queue).
> */
> -static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
> - unsigned int *qtail)
> +int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail)
> {
> struct softnet_data *sd;
> unsigned long flags;
> @@ -2482,6 +2481,7 @@ enqueue:
> kfree_skb(skb);
> return NET_RX_DROP;
> }
> +EXPORT_SYMBOL(enqueue_to_backlog);
>
> /**
> * netif_rx - post buffer to the network code
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 363bc26..7a71e76 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1045,6 +1045,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
> if (!try_module_get(prot->owner))
> goto out_free_sec;
> sk_tx_queue_clear(sk);
> + sock_reset_cpu(sk);
> }
>
> return sk;
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 2f691fb..a826758 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -427,6 +427,18 @@ config NET_CLS_ACT
> A recent version of the iproute2 package is required to use
> extended matches.
>
> +config NET_ACT_CPU
> + tristate "Packet distributing"
> + depends on NET_CLS_ACT
> + depends on RPS
> + ---help---
> + Say Y here to enable packet distributing. With it, you can
> + distribute packets among the CPUs on the system in any way you
> + like. It only works on ingress.
> +
> + To compile this code as a module, choose M here: the
> + module will be called act_cpu.
> +
> config NET_ACT_POLICE
> tristate "Traffic Policing"
> depends on NET_CLS_ACT
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index f14e71b..a9e0b96 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -7,6 +7,7 @@ obj-y := sch_generic.o sch_mq.o
> obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
> obj-$(CONFIG_NET_CLS) += cls_api.o
> obj-$(CONFIG_NET_CLS_ACT) += act_api.o
> +obj-$(CONFIG_NET_ACT_CPU) += act_cpu.o
> obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
> obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
> obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
> diff --git a/net/sched/act_cpu.c b/net/sched/act_cpu.c
> new file mode 100644
> index 0000000..6b7d7fc
> --- /dev/null
> +++ b/net/sched/act_cpu.c
> @@ -0,0 +1,260 @@
> +/*
> + * Packet distributing actions
> + *
> + * Copyright (c) 2010- Changli Gao <xiaosuo@...il.com>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the Free
> + * Software Foundation; either version 2 of the License, or (at your option)
> + * any later version.
> + */
> +
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/string.h>
> +#include <linux/errno.h>
> +#include <linux/skbuff.h>
> +#include <linux/rtnetlink.h>
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/gfp.h>
> +#include <net/net_namespace.h>
> +#include <net/netlink.h>
> +#include <net/pkt_sched.h>
> +#include <net/tcp.h>
> +#include <net/udp.h>
> +#include <linux/tc_act/tc_cpu.h>
> +#include <net/tc_act/tc_cpu.h>
> +
> +#define CPU_TAB_MASK 15
> +static struct tcf_common *tcf_cpu_ht[CPU_TAB_MASK + 1];
> +static u32 cpu_idx_gen;
> +static DEFINE_RWLOCK(cpu_lock);
> +
> +static struct tcf_hashinfo cpu_hash_info = {
> + .htab = tcf_cpu_ht,
> + .hmask = CPU_TAB_MASK,
> + .lock = &cpu_lock
> +};
> +
> +static const struct nla_policy cpu_policy[TCA_CPU_MAX + 1] = {
> + [TCA_CPU_PARMS] = { .len = sizeof(struct tc_cpu) },
> +};
> +
> +static int tcf_cpu_init(struct nlattr *nla, struct nlattr *est,
> + struct tc_action *a, int ovr, int bind)
> +{
> + struct nlattr *tb[TCA_CPU_MAX + 1];
> + struct tc_cpu *parm;
> + struct tcf_cpu *p;
> + struct tcf_common *pc;
> + int ret;
> +
> + if (nla == NULL)
> + return -EINVAL;
> + ret = nla_parse_nested(tb, TCA_CPU_MAX, nla, cpu_policy);
> + if (ret < 0)
> + return ret;
> + if (tb[TCA_CPU_PARMS] == NULL)
> + return -EINVAL;
> + parm = nla_data(tb[TCA_CPU_PARMS]);
> + if (parm->type > TCA_CPU_TYPE_MAX || parm->action != TC_ACT_STOLEN)
> + return -EINVAL;
> +
> + pc = tcf_hash_check(parm->index, a, bind, &cpu_hash_info);
> + if (!pc) {
> + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
> + &cpu_idx_gen, &cpu_hash_info);
> + if (IS_ERR(pc))
> + return PTR_ERR(pc);
> + ret = ACT_P_CREATED;
> + } else {
> + if (!ovr) {
> + tcf_hash_release(pc, bind, &cpu_hash_info);
> + return -EEXIST;
> + }
> + ret = 0;
> + }
> + p = to_tcf_cpu(pc);
> +
> + spin_lock_bh(&p->tcf_lock);
> + p->type = parm->type;
> + p->value = parm->value;
> + p->tcf_action = parm->action;
> + spin_unlock_bh(&p->tcf_lock);
> +
> + if (ret == ACT_P_CREATED)
> + tcf_hash_insert(pc, &cpu_hash_info);
> +
> + return ret;
> +}
> +
> +static int tcf_cpu_cleanup(struct tc_action *a, int bind)
> +{
> + struct tcf_cpu *p = a->priv;
> +
> + return tcf_hash_release(&p->common, bind, &cpu_hash_info);
> +}
> +
> +static int tcf_cpu_from_sock(struct sk_buff *skb)
> +{
> + struct iphdr *iph;
> + struct sock *sk;
> + struct {
> + __be16 source, dest;
> + } *ports;
> + int cpu;
> +
> + if (skb->dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
> + goto err;
> + if (!pskb_may_pull(skb, sizeof(*iph)))
> + goto err;
> + iph = (struct iphdr *) skb->data;
> + if (!pskb_may_pull(skb, iph->ihl * 4 + 4))
> + goto err;
> + ports = (void *) (skb->data + iph->ihl * 4);
Why do the lookup again if skb->sk was already set by another module
before you, like tproxy?
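Something like this (untested, and assuming skb->sk is a full socket at
this point) would avoid the second lookup:

	if (skb->sk)
		return skb->sk->sk_cpu;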
> + switch (iph->protocol) {
> + case IPPROTO_TCP:
> + sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr,
> + ports->source, iph->daddr, ports->dest,
> + skb->dev->ifindex);
> + break;
> + case IPPROTO_UDP:
> + sk = udp4_lib_lookup(dev_net(skb->dev), iph->saddr,
> + ports->source, iph->daddr, ports->dest,
> + skb->dev->ifindex);
> + break;
> + default:
> + goto err;
> + }
> +
> + if (!sk)
> + goto err;
> + cpu = sk->sk_cpu;
> + if (sk->sk_protocol == IPPROTO_TCP && sk->sk_state == TCP_TIME_WAIT)
> + inet_twsk_put(inet_twsk(sk));
> + else
> + sock_put(sk);
> +
> + return cpu;
> +
> +err:
> + return smp_processor_id();
> +}
> +
> +static int tcf_cpu(struct sk_buff *skb, struct tc_action *a,
> + struct tcf_result *res)
> +{
> + struct tcf_cpu *p = a->priv;
> + u32 type;
> + u32 value;
> + int cpu, action;
> + struct sk_buff *nskb;
> + unsigned int qtail;
> +
> + spin_lock(&p->tcf_lock);
OK, the big lock...
We have a lockless TCP/UDP input path, and this module adds a lock
again.
> + p->tcf_tm.lastuse = jiffies;
> + p->tcf_bstats.bytes += qdisc_pkt_len(skb);
> + p->tcf_bstats.packets++;
> + type = p->type;
> + value = p->value;
> + action = p->tcf_action;
> + spin_unlock(&p->tcf_lock);
> +
Please replace all this crap (legacy boilerplate copied from other tc
modules) with modern code: RCU, and no locking in the hot path.
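
An untested sketch of the direction I mean, with a hypothetical
'params' pointer added to struct tcf_cpu: the config path builds a new
tcf_cpu_params, swaps it in with rcu_assign_pointer() and frees the old
one after a grace period, while the hot path reads it without taking
tcf_lock; stats would move to percpu counters.

struct tcf_cpu_params {
	u32		type;
	u32		value;
	int		action;
	struct rcu_head	rcu;
};

static int tcf_cpu(struct sk_buff *skb, struct tc_action *a,
		   struct tcf_result *res)
{
	struct tcf_cpu *p = a->priv;
	const struct tcf_cpu_params *params;
	u32 type, value;
	int action;

	rcu_read_lock();
	params = rcu_dereference(p->params);	/* hypothetical field */
	type = params->type;
	value = params->value;
	action = params->action;
	rcu_read_unlock();

	/* bstats/lastuse updates would go to percpu counters here,
	 * then the packet is steered to the selected cpu as before.
	 */
	return action;
}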
--