Date:   Tue, 7 Jul 2020 13:40:24 +0300
From:   Maxim Mikityanskiy <maximmi@...lanox.com>
To:     netdev@...r.kernel.org
Cc:     Yossi Kuperman <yossiku@...lanox.com>,
        Jamal Hadi Salim <jhs@...atatu.com>,
        Cong Wang <xiyou.wangcong@...il.com>,
        John Fastabend <john.fastabend@...il.com>,
        Toke Høiland-Jørgensen <toke@...hat.com>,
        Dave Taht <dave.taht@...il.com>,
        Jiri Pirko <jiri@...lanox.com>,
        Rony Efraim <ronye@...lanox.com>,
        Eran Ben Elisha <eranbe@...lanox.com>
Subject: Re: [RFC PATCH] sch_htb: Hierarchical QoS hardware offload

Comments are very welcome before I move to the full implementation.

On 2020-06-26 13:46, Maxim Mikityanskiy wrote:
> This patch is a follow-up for the RFC posted earlier [1]. You can find a
> detailed description of the motivation there, and this patch is RFC code
> that shows a possible implementation. The main changes are in
> net/sched/sch_htb.c file, and this patch also contains stubs for mlx5
> that show the driver API.
> 
> HTB doesn't scale well because of contention on a single lock, and its
> software shaping also consumes CPU. Mellanox hardware supports
> hierarchical rate limiting that can be leveraged by offloading the
> functionality of HTB.
> 
> Our solution addresses two problems of HTB:
> 
> 1. Contention by flow classification. Currently the filters are attached
> to the HTB instance as follows:
> 
>      # tc filter add dev eth0 parent 1:0 protocol ip flower dst_port 80
>      classid 1:10
> 
> It's possible to move classification to the clsact egress hook, which is
> thread-safe and lock-free:
> 
>      # tc filter add dev eth0 egress protocol ip flower dst_port 80
>      action skbedit priority 1:10
> 
> This way classification still happens in software, but the lock
> contention is eliminated, and it happens before selecting the TX queue,
> allowing the driver to translate the class to the corresponding hardware
> queue.
> 
> Note that this is already compatible with non-offloaded HTB and
> requires no changes to either the kernel or iproute2.
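> 
> For illustration, an offloaded HTB setup could then look as follows
> (the "offload" keyword is hypothetical: it corresponds to the
> TCA_HTB_OFFLOAD flag added by this patch and would still need iproute2
> support):
> 
>      # tc qdisc replace dev eth0 root handle 1: htb offload
>      # tc class add dev eth0 parent 1: classid 1:10 htb rate 1gbit
>      ceil 2gbit
>      # tc filter add dev eth0 egress protocol ip flower dst_port 80
>      action skbedit priority 1:10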
> 
> 2. Contention by handling packets. HTB is not multi-queue: it attaches
> to a whole net device, and handling of all packets takes the same lock.
> Our solution offloads the logic of HTB to the hardware and registers
> HTB as a multi-queue qdisc, similarly to how the mq qdisc works, i.e.,
> HTB is attached to the netdev, and each queue has its own qdisc. The
> control flow is performed by HTB, which replicates the hierarchy of
> classes in hardware by calling the driver's callbacks. Leaf classes are
> represented by hardware queues. The data path works as follows: a
> packet is classified by clsact, the driver selects the hardware queue
> according to the packet's class, and the packet is enqueued into that
> queue's qdisc.
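> 
> The driver part is only stubbed out in this patch. As a rough sketch
> (all names below are made up for illustration, not part of this patch),
> the driver could keep a classid-to-queue table that the
> TC_HTB_NEW_LEAF / TC_HTB_DEL_LEAF callbacks maintain and that queue
> selection consults:
> 
>      /* Hypothetical driver state: one entry per offloaded HTB leaf
>       * class, added on TC_HTB_NEW_LEAF and removed on TC_HTB_DEL_LEAF.
>       */
>      struct myhw_htb_leaf {
>              struct hlist_node hnode;
>              u16 classid;    /* TC_H_MIN() of the leaf class */
>              u16 txq;        /* hardware queue backing this class */
>      };
> 
>      struct myhw_priv {
>              DECLARE_HASHTABLE(htb_leaves, 8); /* keyed by leaf classid */
>      };
> 
>      static u16 myhw_select_htb_queue(struct myhw_priv *priv,
>                                       struct sk_buff *skb)
>      {
>              u16 classid = TC_H_MIN(skb->priority);
>              struct myhw_htb_leaf *leaf;
> 
>              hash_for_each_possible(priv->htb_leaves, leaf, hnode, classid)
>                      if (leaf->classid == classid)
>                              return leaf->txq;
> 
>              /* Unknown class: fall back to default queue selection
>               * (myhw_default_txq is an assumed helper).
>               */
>              return myhw_default_txq(priv, skb);
>      }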
> 
> I'm looking forward to hearing feedback on this implementation. All
> feedback will be much appreciated.
> 
> Thanks,
> Max
> 
> [1]: https://www.spinics.net/lists/netdev/msg628422.html
> 
> Signed-off-by: Maxim Mikityanskiy <maximmi@...lanox.com>
> ---
>   drivers/net/ethernet/mellanox/mlx5/core/en.h  |   1 +
>   .../net/ethernet/mellanox/mlx5/core/en_main.c |  38 ++-
>   .../net/ethernet/mellanox/mlx5/core/en_tx.c   |  12 +
>   include/linux/netdevice.h                     |   1 +
>   include/net/pkt_cls.h                         |  12 +
>   include/net/sch_generic.h                     |  14 +-
>   include/uapi/linux/pkt_sched.h                |   1 +
>   net/sched/sch_htb.c                           | 262 +++++++++++++++++-
>   tools/include/uapi/linux/pkt_sched.h          |   1 +
>   9 files changed, 320 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index 3dad3fe03e7e..23841f6682f6 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> @@ -70,6 +70,7 @@ struct page_pool;
>   #define MLX5E_SW2HW_MTU(params, swmtu) ((swmtu) + ((params)->hard_mtu))
>   
>   #define MLX5E_MAX_NUM_TC	8
> +#define MLX5E_MAX_NUM_HTB_QUEUES 1024
>   
>   #define MLX5_RX_HEADROOM NET_SKB_PAD
>   #define MLX5_SKB_FRAG_SZ(len)	(SKB_DATA_ALIGN(len) +	\
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 88ae81ba03fc..56803833643d 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -3562,6 +3562,26 @@ static int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv,
>   	return err;
>   }
>   
> +static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb)
> +{
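> +	/* RFC stubs: just log the callbacks for now; a real implementation
> +	 * would program the hardware QoS hierarchy here.
> +	 */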
> +	switch (htb->command) {
> +	case TC_HTB_CREATE:
> +		pr_info("%s: create\n", __func__);
> +		return 0;
> +	case TC_HTB_DESTROY:
> +		pr_info("%s: destroy\n", __func__);
> +		return 0;
> +	case TC_HTB_NEW_LEAF:
> +		pr_info("%s: new leaf %lx\n", __func__, htb->classid);
> +		return 0;
> +	case TC_HTB_DEL_LEAF:
> +		pr_info("%s: del leaf %lx\n", __func__, htb->classid);
> +		return 0;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +}
> +
>   static LIST_HEAD(mlx5e_block_cb_list);
>   
>   static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
> @@ -3581,6 +3601,8 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
>   	}
>   	case TC_SETUP_QDISC_MQPRIO:
>   		return mlx5e_setup_tc_mqprio(priv, type_data);
> +	case TC_SETUP_QDISC_HTB:
> +		return mlx5e_setup_tc_htb(priv, type_data);
>   	default:
>   		return -EOPNOTSUPP;
>   	}
> @@ -3751,20 +3773,22 @@ static int set_feature_cvlan_filter(struct net_device *netdev, bool enable)
>   	return 0;
>   }
>   
> -#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
> -static int set_feature_tc_num_filters(struct net_device *netdev, bool enable)
> +static int set_feature_hw_tc(struct net_device *netdev, bool enable)
>   {
>   	struct mlx5e_priv *priv = netdev_priv(netdev);
>   
> +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
>   	if (!enable && mlx5e_tc_num_filters(priv, MLX5_TC_FLAG(NIC_OFFLOAD))) {
>   		netdev_err(netdev,
>   			   "Active offloaded tc filters, can't turn hw_tc_offload off\n");
>   		return -EINVAL;
>   	}
> +#endif
> +
> +	// TODO: HTB offload.
>   
>   	return 0;
>   }
> -#endif
>   
>   static int set_feature_rx_all(struct net_device *netdev, bool enable)
>   {
> @@ -3862,9 +3886,7 @@ int mlx5e_set_features(struct net_device *netdev, netdev_features_t features)
>   	err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
>   	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER,
>   				    set_feature_cvlan_filter);
> -#if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
> -	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters);
> -#endif
> +	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc);
>   	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all);
>   	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
>   	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan);
> @@ -5018,6 +5040,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
>   		netdev->hw_features	 |= NETIF_F_NTUPLE;
>   #endif
>   	}
> +	// TODO: HTB offload.
> +	netdev->features |= NETIF_F_HW_TC;
>   
>   	netdev->features         |= NETIF_F_HIGHDMA;
>   	netdev->features         |= NETIF_F_HW_VLAN_STAG_FILTER;
> @@ -5353,7 +5377,7 @@ struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
>   	int err;
>   
>   	netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv),
> -				    nch * profile->max_tc,
> +				    nch * profile->max_tc + MLX5E_MAX_NUM_HTB_QUEUES,
>   				    nch * profile->rq_groups);
>   	if (!netdev) {
>   		mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n");
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> index 320946195687..434b482a3dd3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
> @@ -74,6 +74,18 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
>   	int up = 0;
>   	int ch_ix;
>   
> +	// TODO: Map unset priority to the standard algorithm.
> +	// TODO: Map priority (==class) to queue and select it.
> +	if (skb->priority) {
> +		// TODO: Check that TC_H_MAJ corresponds to the HTB offloaded qdisc.
> +		unsigned long class = TC_H_MIN(skb->priority);
> +
> +		// TODO: Map to queue and return it.
> +		// TODO: False positives with MQ.
> +		netdev_info(dev, "TX from class %lu\n", class);
> +		return 0;
> +	}
> +
>   	if (!netdev_get_num_tc(dev))
>   		return txq_ix;
>   
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 6fc613ed8eae..7227d988576d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -859,6 +859,7 @@ enum tc_setup_type {
>   	TC_SETUP_QDISC_ETS,
>   	TC_SETUP_QDISC_TBF,
>   	TC_SETUP_QDISC_FIFO,
> +	TC_SETUP_QDISC_HTB,
>   };
>   
>   /* These structures hold the attributes of bpf state that are being passed
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index ff017e5b3ea2..97523ca54c6b 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -734,6 +734,18 @@ struct tc_mq_qopt_offload {
>   	};
>   };
>   
> +enum tc_htb_command {
> +	TC_HTB_CREATE,
> +	TC_HTB_DESTROY,
> +	TC_HTB_NEW_LEAF,
> +	TC_HTB_DEL_LEAF,
> +};
> +
> +struct tc_htb_qopt_offload {
> +	enum tc_htb_command command;
> +	unsigned long classid;
> +};
> +
>   enum tc_red_command {
>   	TC_RED_REPLACE,
>   	TC_RED_DESTROY,
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index c510b03b9751..73162736dcfe 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -559,14 +559,20 @@ static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
>   	return qdisc->dev_queue->dev;
>   }
>   
> -static inline void sch_tree_lock(const struct Qdisc *q)
> +static inline void sch_tree_lock(struct Qdisc *q)
>   {
> -	spin_lock_bh(qdisc_root_sleeping_lock(q));
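> +	/* MQ-root qdiscs (TCQ_F_MQROOT) are attached to the device rather
> +	 * than to a single TX queue, so take the qdisc's own lock instead
> +	 * of resolving the root qdisc's lock through dev_queue.
> +	 */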
> +	if (q->flags & TCQ_F_MQROOT)
> +		spin_lock_bh(qdisc_lock(q));
> +	else
> +		spin_lock_bh(qdisc_root_sleeping_lock(q));
>   }
>   
> -static inline void sch_tree_unlock(const struct Qdisc *q)
> +static inline void sch_tree_unlock(struct Qdisc *q)
>   {
> -	spin_unlock_bh(qdisc_root_sleeping_lock(q));
> +	if (q->flags & TCQ_F_MQROOT)
> +		spin_unlock_bh(qdisc_lock(q));
> +	else
> +		spin_unlock_bh(qdisc_root_sleeping_lock(q));
>   }
>   
>   extern struct Qdisc noop_qdisc;
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index a95f3ae7ab37..906e19ed2d53 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -432,6 +432,7 @@ enum {
>   	TCA_HTB_RATE64,
>   	TCA_HTB_CEIL64,
>   	TCA_HTB_PAD,
> +	TCA_HTB_OFFLOAD,
>   	__TCA_HTB_MAX,
>   };
>   
> diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
> index 8184c87da8be..1d82afe96f8e 100644
> --- a/net/sched/sch_htb.c
> +++ b/net/sched/sch_htb.c
> @@ -174,6 +174,10 @@ struct htb_sched {
>   	int			row_mask[TC_HTB_MAXDEPTH];
>   
>   	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
> +
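> +	/* Per-queue default qdiscs, created in htb_init and grafted onto
> +	 * the TX queues in htb_attach (offload mode only).
> +	 */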
> +	struct Qdisc		**direct_qdiscs;
> +
> +	bool			offload;
>   };
>   
>   /* find class in global hash table using given handle */
> @@ -980,6 +984,7 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
>   	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
>   	[TCA_HTB_RATE64] = { .type = NLA_U64 },
>   	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
> +	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
>   };
>   
>   static void htb_work_func(struct work_struct *work)
> @@ -992,13 +997,23 @@ static void htb_work_func(struct work_struct *work)
>   	rcu_read_unlock();
>   }
>   
> +static void htb_set_lockdep_class_child(struct Qdisc *q)
> +{
> +	static struct lock_class_key child_key;
> +
> +	lockdep_set_class(qdisc_lock(q), &child_key);
> +}
> +
>   static int htb_init(struct Qdisc *sch, struct nlattr *opt,
>   		    struct netlink_ext_ack *extack)
>   {
> +	struct net_device *dev = qdisc_dev(sch);
>   	struct htb_sched *q = qdisc_priv(sch);
>   	struct nlattr *tb[TCA_HTB_MAX + 1];
>   	struct tc_htb_glob *gopt;
> +	unsigned int ntx;
>   	int err;
> +	struct tc_htb_qopt_offload htb_offload;
>   
>   	qdisc_watchdog_init(&q->watchdog, sch);
>   	INIT_WORK(&q->work, htb_work_func);
> @@ -1022,9 +1037,22 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
>   	if (gopt->version != HTB_VER >> 16)
>   		return -EINVAL;
>   
> +	q->offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
> +	/* RFC only: force offload on for testing, since iproute2 doesn't
> +	 * set TCA_HTB_OFFLOAD yet.
> +	 */
> +	q->offload = true;
> +
> +	if (q->offload) {
> +		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
> +			return -EOPNOTSUPP;
> +
> +		q->direct_qdiscs = kcalloc(dev->real_num_tx_queues,
> +					   sizeof(*q->direct_qdiscs), GFP_KERNEL);
> +		if (!q->direct_qdiscs)
> +			return -ENOMEM;
> +	}
> +
>   	err = qdisc_class_hash_init(&q->clhash);
>   	if (err < 0)
> -		return err;
> +		goto err_free_direct_qdiscs;
>   
>   	qdisc_skb_head_init(&q->direct_queue);
>   
> @@ -1037,7 +1065,92 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
>   		q->rate2quantum = 1;
>   	q->defcls = gopt->defcls;
>   
> +	if (!q->offload)
> +		return 0;
> +
> +	for (ntx = 0; ntx < dev->real_num_tx_queues; ntx++) {
> +		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
> +		struct Qdisc *qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
> +							TC_H_MAKE(sch->handle, ntx + 1),
> +							extack);
> +
> +		if (!qdisc) {
> +			err = -ENOMEM;
> +			goto err_free_qdiscs;
> +		}
> +
> +		htb_set_lockdep_class_child(qdisc);
> +		q->direct_qdiscs[ntx] = qdisc;
> +		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
> +	}
> +
> +	sch->flags |= TCQ_F_MQROOT;
> +
> +	htb_offload.command = TC_HTB_CREATE;
> +	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, &htb_offload);
> +
>   	return 0;
> +
> +err_free_qdiscs:
> +	for (ntx = 0; ntx < dev->real_num_tx_queues && q->direct_qdiscs[ntx]; ntx++)
> +		qdisc_put(q->direct_qdiscs[ntx]);
> +
> +	qdisc_class_hash_destroy(&q->clhash);
> +
> +err_free_direct_qdiscs:
> +	kfree(q->direct_qdiscs);
> +	return err;
> +}
> +
> +static void htb_attach_offload(struct Qdisc *sch)
> +{
> +	struct net_device *dev = qdisc_dev(sch);
> +	struct htb_sched *q = qdisc_priv(sch);
> +	unsigned int ntx;
> +
> +	// TODO: real_num_tx_queues might have changed.
> +	for (ntx = 0; ntx < dev->real_num_tx_queues; ntx++) {
> +		struct Qdisc *qdisc = q->direct_qdiscs[ntx];
> +		struct Qdisc *old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
> +
> +		qdisc_put(old);
> +		qdisc_hash_add(qdisc, false);
> +	}
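> +	/* Queues beyond real_num_tx_queues fall back to noop_qdisc via a
> +	 * NULL graft.
> +	 */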
> +	for (ntx = dev->real_num_tx_queues; ntx < dev->num_tx_queues; ntx++) {
> +		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
> +		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
> +
> +		qdisc_put(old);
> +	}
> +
> +	kfree(q->direct_qdiscs);
> +	q->direct_qdiscs = NULL;
> +}
> +
> +static void htb_attach_software(struct Qdisc *sch)
> +{
> +	struct net_device *dev = qdisc_dev(sch);
> +	unsigned int ntx;
> +
> +	/* Resemble qdisc_graft behavior. */
> +	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
> +		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
> +		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
> +
> +		qdisc_refcount_inc(sch);
> +
> +		qdisc_put(old);
> +	}
> +}
> +
> +static void htb_attach(struct Qdisc *sch)
> +{
> +	struct htb_sched *q = qdisc_priv(sch);
> +
> +	if (q->offload)
> +		htb_attach_offload(sch);
> +	else
> +		htb_attach_software(sch);
>   }
>   
>   static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
> @@ -1144,19 +1257,56 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
>   	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
>   }
>   
> +static struct netdev_queue *htb_offload_class_to_queue(struct Qdisc *sch, unsigned long classid)
> +{
> +	struct net_device *dev = qdisc_dev(sch);
> +	unsigned long ntx = TC_H_MIN(classid) - 1;
> +
> +	if (ntx >= dev->num_tx_queues)
> +		return NULL;
> +	return netdev_get_tx_queue(dev, ntx);
> +}
> +
> +static struct netdev_queue *htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
> +{
> +	return htb_offload_class_to_queue(sch, tcm->tcm_parent);
> +}
> +
>   static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
>   		     struct Qdisc **old, struct netlink_ext_ack *extack)
>   {
>   	struct htb_class *cl = (struct htb_class *)arg;
> +	struct netdev_queue *dev_queue = sch->dev_queue;
> +	struct htb_sched *q = qdisc_priv(sch);
> +	struct Qdisc *old_q;
>   
>   	if (cl->level)
>   		return -EINVAL;
> -	if (new == NULL &&
> -	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
> -				     cl->common.classid, extack)) == NULL)
> +
> +	if (q->offload)
> +		dev_queue = htb_offload_class_to_queue(sch, cl->common.classid);
> +
> +	if (!new)
> +		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
> +					cl->common.classid, extack);
> +
> +	if (!new)
>   		return -ENOBUFS;
>   
> +	if (q->offload) {
> +		htb_set_lockdep_class_child(new);
> +		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
> +		qdisc_refcount_inc(new);
> +		old_q = dev_graft_qdisc(dev_queue, new);
> +	}
> +
>   	*old = qdisc_replace(sch, new, &cl->leaf.q);
> +
> +	if (q->offload) {
> +		WARN_ON(old_q != *old);
> +		qdisc_put(old_q);
> +	}
> +
>   	return 0;
>   }
>   
> @@ -1184,10 +1334,14 @@ static inline int htb_parent_last_child(struct htb_class *cl)
>   	return 1;
>   }
>   
> -static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
> +static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
>   			       struct Qdisc *new_q)
>   {
> +	struct htb_sched *q = qdisc_priv(sch);
>   	struct htb_class *parent = cl->parent;
> +	struct netdev_queue *dev_queue;
> +	struct Qdisc *old_q;
> +	struct tc_htb_qopt_offload htb_offload;
>   
>   	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
>   
> @@ -1202,13 +1356,47 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
>   	parent->ctokens = parent->cbuffer;
>   	parent->t_c = ktime_get_ns();
>   	parent->cmode = HTB_CAN_SEND;
> +
> +	if (!q->offload)
> +		return;
> +
> +	dev_queue = htb_offload_class_to_queue(sch, parent->common.classid);
> +	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
> +	qdisc_refcount_inc(new_q);
> +	old_q = dev_graft_qdisc(dev_queue, new_q);
> +	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
> +	htb_offload.command = TC_HTB_NEW_LEAF;
> +	htb_offload.classid = parent->common.classid;
> +	// TODO: extack
> +	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new_q, old_q,
> +				   TC_SETUP_QDISC_HTB, &htb_offload, NULL);
>   }
>   
>   static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
>   {
>   	if (!cl->level) {
> -		WARN_ON(!cl->leaf.q);
> -		qdisc_put(cl->leaf.q);
> +		struct htb_sched *priv = qdisc_priv(sch);
> +		struct Qdisc *q = cl->leaf.q;
> +		struct Qdisc *old;
> +		struct tc_htb_qopt_offload htb_offload;
> +
> +		WARN_ON(!q);
> +		if (priv->offload) {
> +			old = dev_graft_qdisc(q->dev_queue, NULL);
> +			/* On destroy of HTB, dev_queue qdiscs are destroyed
> +			 * first, and old is noop_qdisc.
> +			 */
> +			WARN_ON(old != q && !(old->flags & TCQ_F_BUILTIN));
> +
> +			htb_offload.command = TC_HTB_DEL_LEAF;
> +			htb_offload.classid = cl->common.classid;
> +			// TODO: extack
> +			qdisc_offload_graft_helper(qdisc_dev(sch), sch, NULL, q,
> +						   TC_SETUP_QDISC_HTB, &htb_offload, NULL);
> +
> +			qdisc_put(old);
> +		}
> +		qdisc_put(q);
>   	}
>   	gen_kill_estimator(&cl->rate_est);
>   	tcf_block_put(cl->block);
> @@ -1217,10 +1405,12 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
>   
>   static void htb_destroy(struct Qdisc *sch)
>   {
> +	struct net_device *dev = qdisc_dev(sch);
>   	struct htb_sched *q = qdisc_priv(sch);
>   	struct hlist_node *next;
>   	struct htb_class *cl;
>   	unsigned int i;
> +	struct tc_htb_qopt_offload htb_offload;
>   
>   	cancel_work_sync(&q->work);
>   	qdisc_watchdog_cancel(&q->watchdog);
> @@ -1244,6 +1434,21 @@ static void htb_destroy(struct Qdisc *sch)
>   	}
>   	qdisc_class_hash_destroy(&q->clhash);
>   	__qdisc_reset_queue(&q->direct_queue);
> +
> +	if (!q->offload)
> +		return;
> +
> +	if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc) {
> +		htb_offload.command = TC_HTB_DESTROY;
> +		dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, &htb_offload);
> +	}
> +
> +	if (!q->direct_qdiscs)
> +		return;
> +	// TODO: real_num_tx_queues might have changed.
> +	for (i = 0; i < dev->real_num_tx_queues && q->direct_qdiscs[i]; i++)
> +		qdisc_put(q->direct_qdiscs[i]);
> +	kfree(q->direct_qdiscs);
>   }
>   
>   static int htb_delete(struct Qdisc *sch, unsigned long arg)
> @@ -1265,6 +1470,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
>   					  cl->parent->common.classid,
>   					  NULL);
>   		last_child = 1;
> +		if (q->offload)
> +			htb_set_lockdep_class_child(new_q);
>   	}
>   
>   	sch_tree_lock(sch);
> @@ -1285,7 +1492,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
>   				  &q->hlevel[cl->level].wait_pq);
>   
>   	if (last_child)
> -		htb_parent_to_leaf(q, cl, new_q);
> +		htb_parent_to_leaf(sch, cl, new_q);
>   
>   	sch_tree_unlock(sch);
>   
> @@ -1306,6 +1513,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>   	struct tc_htb_opt *hopt;
>   	u64 rate64, ceil64;
>   	int warn = 0;
> +	struct netdev_queue *dev_queue;
> +	struct tc_htb_qopt_offload htb_offload;
>   
>   	/* extract all subattrs from opt attr */
>   	if (!opt)
> @@ -1336,7 +1545,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>   					      NULL));
>   
>   	if (!cl) {		/* new class */
> -		struct Qdisc *new_q;
> +		struct Qdisc *new_q, *old_q;
>   		int prio;
>   		struct {
>   			struct nlattr		nla;
> @@ -1392,13 +1601,29 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>   		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
>   			RB_CLEAR_NODE(&cl->node[prio]);
>   
> +		cl->common.classid = classid;
> +
>   		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
>   		 * so that can't be used inside of sch_tree_lock
>   		 * -- thanks to Karlis Peisenieks
>   		 */
> -		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
> +		dev_queue = q->offload ? htb_offload_class_to_queue(sch, cl->common.classid) :
> +					 sch->dev_queue;
> +		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
>   					  classid, NULL);
>   		sch_tree_lock(sch);
> +		if (q->offload) {
> +			htb_set_lockdep_class_child(new_q);
> +			/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
> +			qdisc_refcount_inc(new_q);
> +			old_q = dev_graft_qdisc(dev_queue, new_q);
> +			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
> +			htb_offload.command = TC_HTB_NEW_LEAF;
> +			htb_offload.classid = cl->common.classid;
> +			qdisc_offload_graft_helper(qdisc_dev(sch), sch, new_q, old_q,
> +						   TC_SETUP_QDISC_HTB, &htb_offload, extack);
> +			qdisc_put(old_q);
> +		}
>   		if (parent && !parent->level) {
>   			/* turn parent into inner node */
>   			qdisc_purge_queue(parent->leaf.q);
> @@ -1414,11 +1639,24 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
>   			parent->level = (parent->parent ? parent->parent->level
>   					 : TC_HTB_MAXDEPTH) - 1;
>   			memset(&parent->inner, 0, sizeof(parent->inner));
> +
> +			if (q->offload) {
> +				struct netdev_queue *parent_queue =
> +					htb_offload_class_to_queue(sch, parent->common.classid);
> +				/* Returns the parent's old leaf qdisc. */
> +				old_q = dev_graft_qdisc(parent_queue, NULL);
> +				htb_offload.command = TC_HTB_DEL_LEAF;
> +				htb_offload.classid = parent->common.classid;
> +				qdisc_offload_graft_helper(qdisc_dev(sch), sch, NULL,
> +							   old_q, TC_SETUP_QDISC_HTB,
> +							   &htb_offload, extack);
> +				qdisc_put(old_q);
> +			}
>   		}
> +
>   		/* leaf (we) needs elementary qdisc */
>   		cl->leaf.q = new_q ? new_q : &noop_qdisc;
>   
> -		cl->common.classid = classid;
>   		cl->parent = parent;
>   
>   		/* set class to be in HTB_CAN_SEND state */
> @@ -1557,6 +1795,7 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
>   }
>   
>   static const struct Qdisc_class_ops htb_class_ops = {
> +	.select_queue	=	htb_select_queue,
>   	.graft		=	htb_graft,
>   	.leaf		=	htb_leaf,
>   	.qlen_notify	=	htb_qlen_notify,
> @@ -1579,6 +1818,7 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
>   	.dequeue	=	htb_dequeue,
>   	.peek		=	qdisc_peek_dequeued,
>   	.init		=	htb_init,
> +	.attach		=	htb_attach,
>   	.reset		=	htb_reset,
>   	.destroy	=	htb_destroy,
>   	.dump		=	htb_dump,
> diff --git a/tools/include/uapi/linux/pkt_sched.h b/tools/include/uapi/linux/pkt_sched.h
> index 0d18b1d1fbbc..5c903abc9fa5 100644
> --- a/tools/include/uapi/linux/pkt_sched.h
> +++ b/tools/include/uapi/linux/pkt_sched.h
> @@ -414,6 +414,7 @@ enum {
>   	TCA_HTB_RATE64,
>   	TCA_HTB_CEIL64,
>   	TCA_HTB_PAD,
> +	TCA_HTB_OFFLOAD,
>   	__TCA_HTB_MAX,
>   };
>   
> 
