netdev - [RFC V3 PATCH 20/26] net/netpolicy: set Rx queues according to policy

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1473692159-4017-21-git-send-email-kan.liang@intel.com>
Date:   Mon, 12 Sep 2016 07:55:53 -0700
From:   kan.liang@...el.com
To:     davem@...emloft.net, linux-kernel@...r.kernel.org,
        netdev@...r.kernel.org
Cc:     jeffrey.t.kirsher@...el.com, mingo@...hat.com,
        peterz@...radead.org, kuznet@....inr.ac.ru, jmorris@...ei.org,
        yoshfuji@...ux-ipv6.org, kaber@...sh.net,
        akpm@...ux-foundation.org, keescook@...omium.org,
        viro@...iv.linux.org.uk, gorcunov@...nvz.org,
        john.stultz@...aro.org, aduyck@...antis.com, ben@...adent.org.uk,
        decot@...glers.com, fw@...len.de, alexander.duyck@...il.com,
        daniel@...earbox.net, tom@...bertland.com, rdunlap@...radead.org,
        xiyou.wangcong@...il.com, hannes@...essinduktion.org,
        stephen@...workplumber.org, alexei.starovoitov@...il.com,
        jesse.brandeburg@...el.com, andi@...stfloor.org,
        Kan Liang <kan.liang@...el.com>
Subject: [RFC V3 PATCH 20/26] net/netpolicy: set Rx queues according to policy

From: Kan Liang <kan.liang@...el.com>

For setting Rx queues, this patch configure Rx network flow
classification rules to redirect the packets to the assigned queue.

Since we may not get all the information required for rule until the
first packet arrived, it will add the rule after recvmsg. Also, to
avoid destroying the connection rates, the configuration will be done
asynchronized by work queue. So the first several packets may not use
the assigned queue.

The dev information will be discarded in udp_queue_rcv_skb, so we record
it in netpolicy struct in advance.

This patch only support INET tcp4 and udp4. It can be extend to other
socket type and V6 later shortly.

For each sk, it only supports one rule. If the port/address changed, the
previos rule will be replaced.

Signed-off-by: Kan Liang <kan.liang@...el.com>
---
 include/linux/netpolicy.h |  30 +++++++++++
 net/core/netpolicy.c      | 132 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/af_inet.c        |  71 +++++++++++++++++++++++++
 net/ipv4/udp.c            |   4 ++
 4 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/include/linux/netpolicy.h b/include/linux/netpolicy.h
index e06b74c..04cd07d 100644
--- a/include/linux/netpolicy.h
+++ b/include/linux/netpolicy.h
@@ -37,6 +37,8 @@ enum netpolicy_traffic {
 	NETPOLICY_RXTX,
 };
 
+#define NETPOLICY_INVALID_QUEUE	-1
+#define NETPOLICY_INVALID_LOC	NETPOLICY_INVALID_QUEUE
 #define POLICY_NAME_LEN_MAX	64
 extern const char *policy_name[];
 
@@ -81,11 +83,34 @@ struct netpolicy_info {
 	struct list_head	obj_list[NETPOLICY_RXTX][NET_POLICY_MAX];
 };
 
+struct netpolicy_tcpudpip4_spec {
+	/* source and Destination host and port */
+	__be32	ip4src;
+	__be32	ip4dst;
+	__be16	psrc;
+	__be16	pdst;
+};
+
+union netpolicy_flow_union {
+	struct netpolicy_tcpudpip4_spec	tcp_udp_ip4_spec;
+};
+
+struct netpolicy_flow_spec {
+	__u32	flow_type;
+	union netpolicy_flow_union	spec;
+};
+
 struct netpolicy_instance {
 	struct net_device	*dev;
 	enum netpolicy_name	policy; /* required policy */
 	void			*ptr;   /* pointers */
 	struct task_struct	*task;
+	int			location;	/* rule location */
+	atomic_t		rule_queue;	/* queue set by rule */
+	struct work_struct	fc_wk;		/* flow classification work */
+	atomic_t		fc_wk_cnt;	/* flow classification work number */
+	struct netpolicy_flow_spec	flow;	/* flow information */
+
 };
 
 struct netpolicy_cpu_load {
@@ -106,6 +131,7 @@ extern int netpolicy_register(struct netpolicy_instance *instance,
 			      enum netpolicy_name policy);
 extern void netpolicy_unregister(struct netpolicy_instance *instance);
 extern int netpolicy_pick_queue(struct netpolicy_instance *instance, bool is_rx);
+extern void netpolicy_set_rules(struct netpolicy_instance *instance);
 #else
 static inline void update_netpolicy_sys_map(void)
 {
@@ -124,6 +150,10 @@ static inline int netpolicy_pick_queue(struct netpolicy_instance *instance, bool
 {
 	return 0;
 }
+
+static inline void netpolicy_set_rules(struct netpolicy_instance *instance)
+{
+}
 #endif
 
 #endif /*__LINUX_NETPOLICY_H*/
diff --git a/net/core/netpolicy.c b/net/core/netpolicy.c
index e82e0d3..252cbee 100644
--- a/net/core/netpolicy.c
+++ b/net/core/netpolicy.c
@@ -54,6 +54,8 @@ struct netpolicy_record {
 static DEFINE_HASHTABLE(np_record_hash, 10);
 static DEFINE_SPINLOCK(np_hashtable_lock);
 
+struct workqueue_struct *np_fc_wq;
+
 static int netpolicy_get_dev_info(struct net_device *dev,
 				  struct netpolicy_dev_info *d_info)
 {
@@ -472,6 +474,90 @@ int netpolicy_pick_queue(struct netpolicy_instance *instance, bool is_rx)
 }
 EXPORT_SYMBOL(netpolicy_pick_queue);
 
+void np_flow_rule_set(struct work_struct *wk)
+{
+	struct netpolicy_instance *instance;
+	struct netpolicy_flow_spec *flow;
+	struct ethtool_rxnfc cmd;
+	struct net_device *dev;
+	int queue, ret;
+
+	instance = container_of(wk, struct netpolicy_instance,
+				fc_wk);
+	if (!instance)
+		goto done;
+
+	flow = &instance->flow;
+	if (WARN_ON(!flow))
+		goto done;
+	dev = instance->dev;
+	if (WARN_ON(!dev))
+		goto done;
+
+	/* Check if ntuple is supported */
+	if (!dev->ethtool_ops->set_rxnfc)
+		goto done;
+
+	/* Only support TCP/UDP V4 by now */
+	if ((flow->flow_type != TCP_V4_FLOW) &&
+	    (flow->flow_type != UDP_V4_FLOW))
+		goto done;
+
+	queue = get_avail_queue(instance, true);
+	if (queue < 0)
+		goto done;
+
+	/* using ethtool flow-type to configure
+	 * Rx network flow classification options or rules
+	 * RX_CLS_LOC_ANY must be supported by the driver
+	 */
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cmd = ETHTOOL_SRXCLSRLINS;
+	cmd.fs.flow_type = flow->flow_type;
+	cmd.fs.h_u.tcp_ip4_spec.ip4src = flow->spec.tcp_udp_ip4_spec.ip4src;
+	cmd.fs.h_u.tcp_ip4_spec.psrc = flow->spec.tcp_udp_ip4_spec.psrc;
+	cmd.fs.h_u.tcp_ip4_spec.ip4dst = flow->spec.tcp_udp_ip4_spec.ip4dst;
+	cmd.fs.h_u.tcp_ip4_spec.pdst = flow->spec.tcp_udp_ip4_spec.pdst;
+	cmd.fs.ring_cookie = queue;
+	cmd.fs.location = RX_CLS_LOC_ANY;
+	rtnl_lock();
+	ret = dev->ethtool_ops->set_rxnfc(dev, &cmd);
+	rtnl_unlock();
+	if (ret < 0) {
+		pr_warn("Failed to set rules ret %d\n", ret);
+		atomic_set(&instance->rule_queue, NETPOLICY_INVALID_QUEUE);
+		goto done;
+	}
+
+	/* TODO: now one sk only has one rule */
+	if (instance->location != NETPOLICY_INVALID_LOC) {
+		/* delete the old rule */
+		struct ethtool_rxnfc del_cmd;
+
+		del_cmd.cmd = ETHTOOL_SRXCLSRLDEL;
+		del_cmd.fs.location = instance->location;
+		rtnl_lock();
+		ret = dev->ethtool_ops->set_rxnfc(dev, &del_cmd);
+		rtnl_unlock();
+		if (ret < 0)
+			pr_warn("Failed to delete rules ret %d\n", ret);
+	}
+
+	/* record rule location */
+	instance->location = cmd.fs.location;
+	atomic_set(&instance->rule_queue, queue);
+done:
+	atomic_set(&instance->fc_wk_cnt, 0);
+}
+
+static void init_instance(struct netpolicy_instance *instance)
+{
+	instance->location = NETPOLICY_INVALID_LOC;
+	atomic_set(&instance->rule_queue, NETPOLICY_INVALID_QUEUE);
+	atomic_set(&instance->fc_wk_cnt, 0);
+	INIT_WORK(&instance->fc_wk, np_flow_rule_set);
+}
+
 /**
  * netpolicy_register() - Register per socket/task policy request
  * @instance:	NET policy per socket/task instance info
@@ -516,6 +602,7 @@ int netpolicy_register(struct netpolicy_instance *instance,
 		}
 		kfree(new);
 	} else {
+		init_instance(instance);
 		new->ptr_id = ptr_id;
 		new->dev = instance->dev;
 		new->policy = policy;
@@ -538,8 +625,23 @@ EXPORT_SYMBOL(netpolicy_register);
  */
 void netpolicy_unregister(struct netpolicy_instance *instance)
 {
-	struct netpolicy_record *record;
 	unsigned long ptr_id = (uintptr_t)instance->ptr;
+	struct net_device *dev = instance->dev;
+	struct netpolicy_record *record;
+
+	cancel_work_sync(&instance->fc_wk);
+	/* remove FD rules */
+	if (dev && instance->location != NETPOLICY_INVALID_LOC) {
+		struct ethtool_rxnfc del_cmd;
+
+		del_cmd.cmd = ETHTOOL_SRXCLSRLDEL;
+		del_cmd.fs.location = instance->location;
+		rtnl_lock();
+		dev->ethtool_ops->set_rxnfc(dev, &del_cmd);
+		rtnl_unlock();
+		instance->location = NETPOLICY_INVALID_LOC;
+		atomic_set(&instance->rule_queue, NETPOLICY_INVALID_QUEUE);
+	}
 
 	spin_lock_bh(&np_hashtable_lock);
 	/* del from hash table */
@@ -555,6 +657,28 @@ void netpolicy_unregister(struct netpolicy_instance *instance)
 }
 EXPORT_SYMBOL(netpolicy_unregister);
 
+/**
+ * netpolicy_set_rules() - Configure Rx network flow classification rules
+ * @instance:	NET policy per socket/task instance info
+ *
+ * This function intends to configure Rx network flow classification rules
+ * according to ip and port information. The configuration will be done
+ * asynchronized by work queue. It avoids to destroy the connection rates.
+ *
+ * Currently, it only supports TCP and UDP V4. Other protocols will be
+ * supported later.
+ *
+ */
+void netpolicy_set_rules(struct netpolicy_instance *instance)
+{
+	/* There should be only one work to run at the same time */
+	if (!atomic_cmpxchg(&instance->fc_wk_cnt, 0, 1)) {
+		instance->task = current;
+		queue_work(np_fc_wq, &instance->fc_wk);
+	}
+}
+EXPORT_SYMBOL(netpolicy_set_rules);
+
 const char *policy_name[NET_POLICY_MAX] = {
 	"NONE",
 	"CPU",
@@ -1255,6 +1379,10 @@ static int __init netpolicy_init(void)
 {
 	int ret;
 
+	np_fc_wq = create_workqueue("np_fc");
+	if (!np_fc_wq)
+		return -ENOMEM;
+
 	ret = register_pernet_subsys(&netpolicy_net_ops);
 	if (!ret)
 		register_netdevice_notifier(&netpolicy_dev_notf);
@@ -1268,6 +1396,8 @@ static int __init netpolicy_init(void)
 
 static void __exit netpolicy_exit(void)
 {
+	destroy_workqueue(np_fc_wq);
+
 	unregister_netdevice_notifier(&netpolicy_dev_notf);
 	unregister_pernet_subsys(&netpolicy_net_ops);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e94b47b..209edc4 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -754,6 +754,71 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
 }
 EXPORT_SYMBOL(inet_sendpage);
 
+static void sock_netpolicy_manage_flow(struct sock *sk, struct msghdr *msg)
+{
+#ifdef CONFIG_NETPOLICY
+	struct netpolicy_instance *instance;
+	struct netpolicy_flow_spec *flow;
+	bool change = false;
+	int queue;
+
+	instance = netpolicy_find_instance(sk);
+	if (!instance)
+		return;
+
+	if (!instance->dev)
+		return;
+
+	flow = &instance->flow;
+	/* TODO: need to change here and add more protocol support */
+	if (sk->sk_family != AF_INET)
+		return;
+	if ((sk->sk_protocol == IPPROTO_TCP) &&
+	    (sk->sk_type == SOCK_STREAM)) {
+		if ((flow->flow_type != TCP_V4_FLOW) ||
+		    (flow->spec.tcp_udp_ip4_spec.ip4src != sk->sk_daddr) ||
+		    (flow->spec.tcp_udp_ip4_spec.psrc != sk->sk_dport) ||
+		    (flow->spec.tcp_udp_ip4_spec.ip4dst != sk->sk_rcv_saddr) ||
+		    (flow->spec.tcp_udp_ip4_spec.pdst != htons(sk->sk_num)))
+			change = true;
+		if (change) {
+			flow->flow_type = TCP_V4_FLOW;
+			flow->spec.tcp_udp_ip4_spec.ip4src = sk->sk_daddr;
+			flow->spec.tcp_udp_ip4_spec.psrc = sk->sk_dport;
+			flow->spec.tcp_udp_ip4_spec.ip4dst = sk->sk_rcv_saddr;
+			flow->spec.tcp_udp_ip4_spec.pdst = htons(sk->sk_num);
+		}
+	} else if ((sk->sk_protocol == IPPROTO_UDP) &&
+		   (sk->sk_type == SOCK_DGRAM)) {
+			DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+
+			if (!sin || !sin->sin_addr.s_addr || !sin->sin_port)
+				return;
+			if ((flow->flow_type != UDP_V4_FLOW) ||
+			    (flow->spec.tcp_udp_ip4_spec.ip4src != sin->sin_addr.s_addr) ||
+			    (flow->spec.tcp_udp_ip4_spec.psrc != sin->sin_port) ||
+			    (flow->spec.tcp_udp_ip4_spec.ip4dst != sk->sk_rcv_saddr) ||
+			    (flow->spec.tcp_udp_ip4_spec.pdst != htons(sk->sk_num)))
+				change = true;
+			if (change) {
+				flow->flow_type = UDP_V4_FLOW;
+				flow->spec.tcp_udp_ip4_spec.ip4src = sin->sin_addr.s_addr;
+				flow->spec.tcp_udp_ip4_spec.psrc = sin->sin_port;
+				flow->spec.tcp_udp_ip4_spec.ip4dst = sk->sk_rcv_saddr;
+				flow->spec.tcp_udp_ip4_spec.pdst = htons(sk->sk_num);
+			}
+	} else {
+		return;
+	}
+
+	queue = netpolicy_pick_queue(instance, true);
+	if (queue < 0)
+		return;
+	if ((queue != atomic_read(&instance->rule_queue)) || change)
+		netpolicy_set_rules(instance);
+#endif
+}
+
 int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		 int flags)
 {
@@ -767,6 +832,12 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 				   flags & ~MSG_DONTWAIT, &addr_len);
 	if (err >= 0)
 		msg->msg_namelen = addr_len;
+
+	/* The dev info, src address and port information for UDP
+	 * can only be retrieved after processing the msg.
+	 */
+	sock_netpolicy_manage_flow(sk, msg);
+
 	return err;
 }
 EXPORT_SYMBOL(inet_recvmsg);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 058c312..cc2499d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1786,6 +1786,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (sk) {
 		int ret;
 
+#ifdef CONFIG_NETPOLICY
+		/* Record dev info before it's discarded in udp_queue_rcv_skb */
+		sk->sk_netpolicy.dev = skb->dev;
+#endif
 		if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
 			skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
 						 inet_compute_pseudo);
-- 
2.5.5