[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAL+tcoCp12t_5PRWWkiMi++1MgYX4WXW4dUDXYzHF_tJACw3dg@mail.gmail.com>
Date: Wed, 3 Sep 2025 00:42:41 +0800
From: Jason Xing <kerneljasonxing@...il.com>
To: Xin Zhao <jackzxcui1989@....com>
Cc: willemdebruijn.kernel@...il.com, edumazet@...gle.com, ferenc@...es.dev,
davem@...emloft.net, kuba@...nel.org, pabeni@...hat.com, horms@...nel.org,
netdev@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the
retire operation
On Tue, Sep 2, 2025 at 11:43 PM Jason Xing <kerneljasonxing@...il.com> wrote:
>
> On Sun, Aug 31, 2025 at 6:09 PM Xin Zhao <jackzxcui1989@....com> wrote:
> >
> > In a system with high real-time requirements, the timeout mechanism of
> > ordinary timers with jiffies granularity is insufficient to meet the
> > demands for real-time performance. Meanwhile, the optimization of CPU
> > usage with af_packet is quite significant. Use hrtimer instead of timer
> > to help compensate for the shortcomings in real-time performance.
> > On a system with HZ=100 or HZ=250, the update of TP_STATUS_USER is not real-time
> > enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> > This is unacceptable in some high real-time systems that require timely
> > processing of network packets. By replacing it with hrtimer, if a timeout
> > of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> > 3 ms.
> >
> > Signed-off-by: Xin Zhao <jackzxcui1989@....com>
> > ---
> > Changes in v8:
> > - Simplify the logic related to setting timeout.
> >
> > Changes in v7:
> > - Only update the hrtimer expire time within the hrtimer callback.
> >
> > Changes in v1:
> > - Do not add another config for the current changes.
> >
> > ---
> > net/packet/af_packet.c | 79 +++++++++---------------------------------
> > net/packet/diag.c | 2 +-
> > net/packet/internal.h | 6 ++--
> > 3 files changed, 20 insertions(+), 67 deletions(-)
> >
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index d4eb4a4fe..3e3bb4216 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
> > static int prb_queue_frozen(struct tpacket_kbdq_core *);
> > static void prb_open_block(struct tpacket_kbdq_core *,
> > struct tpacket_block_desc *);
> > -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> > -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
> > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
> > static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
> > static void prb_clear_rxhash(struct tpacket_kbdq_core *,
> > struct tpacket3_hdr *);
> > @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
> > return proto;
> > }
> >
> > -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> > -{
> > - timer_delete_sync(&pkc->retire_blk_timer);
> > -}
> > -
> > static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
> > struct sk_buff_head *rb_queue)
> > {
> > struct tpacket_kbdq_core *pkc;
> >
> > pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> > -
> > - spin_lock_bh(&rb_queue->lock);
> > - pkc->delete_blk_timer = 1;
One more review comment from my side: I'm afraid the removal of
delete_blk_timer deserves a clarification in the commit
message.
> > - spin_unlock_bh(&rb_queue->lock);
> > -
> > - prb_del_retire_blk_timer(pkc);
> > -}
> > -
> > -static void prb_setup_retire_blk_timer(struct packet_sock *po)
> > -{
> > - struct tpacket_kbdq_core *pkc;
> > -
> > - pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> > - timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > - 0);
> > - pkc->retire_blk_timer.expires = jiffies;
> > + hrtimer_cancel(&pkc->retire_blk_timer);
> > }
> >
> > static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> > @@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
> > p1->version = po->tp_version;
> > po->stats.stats3.tp_freeze_q_cnt = 0;
> > if (req_u->req3.tp_retire_blk_tov)
> > - p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
> > + p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
> > else
> > - p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> > - req_u->req3.tp_block_size);
> > - p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > + p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> > + req_u->req3.tp_block_size));
> > p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
> > rwlock_init(&p1->blk_fill_in_prog_lock);
> >
> > p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
> > prb_init_ft_ops(p1, req_u);
> > - prb_setup_retire_blk_timer(po);
> > + hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> > + hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
> > + HRTIMER_MODE_REL_SOFT);
>
> Do you expect the timer to start at the setsockopt phase, even though
> that may be long before recv is actually used?
>
> > prb_open_block(p1, pbd);
> > }
> >
> > -/* Do NOT update the last_blk_num first.
> > - * Assumes sk_buff_head lock is held.
> > - */
> > -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> > -{
> > - mod_timer(&pkc->retire_blk_timer,
> > - jiffies + pkc->tov_in_jiffies);
> > -}
> > -
> > /*
> > * Timer logic:
> > * 1) We refresh the timer only when we open a block.
> > @@ -717,7 +689,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> > * prb_calc_retire_blk_tmo() calculates the tmo.
> > *
> > */
> > -static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
> > {
> > struct packet_sock *po =
> > timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> > @@ -730,9 +702,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > frozen = prb_queue_frozen(pkc);
> > pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
> >
> > - if (unlikely(pkc->delete_blk_timer))
> > - goto out;
> > -
> > /* We only need to plug the race when the block is partially filled.
> > * tpacket_rcv:
> > * lock(); increment BLOCK_NUM_PKTS; unlock()
> > @@ -749,26 +718,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > }
> >
> > if (!frozen) {
> > - if (!BLOCK_NUM_PKTS(pbd)) {
> > - /* An empty block. Just refresh the timer. */
> > - goto refresh_timer;
> > + if (BLOCK_NUM_PKTS(pbd)) {
> > + /* Not an empty block. Need retire the block. */
> > + prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> > + prb_dispatch_next_block(pkc, po);
> > }
> > - prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> > - if (!prb_dispatch_next_block(pkc, po))
> > - goto refresh_timer;
> > - else
> > - goto out;
> > } else {
> > /* Case 1. Queue was frozen because user-space was
> > * lagging behind.
> > */
> > - if (prb_curr_blk_in_use(pbd)) {
> > - /*
> > - * Ok, user-space is still behind.
> > - * So just refresh the timer.
> > - */
> > - goto refresh_timer;
> > - } else {
> > + if (!prb_curr_blk_in_use(pbd)) {
> > /* Case 2. queue was frozen,user-space caught up,
> > * now the link went idle && the timer fired.
> > * We don't have a block to close.So we open this
> > @@ -777,15 +736,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > * Thawing/timer-refresh is a side effect.
> > */
> > prb_open_block(pkc, pbd);
> > - goto out;
> > }
> > }
> >
> > -refresh_timer:
> > - _prb_refresh_rx_retire_blk_timer(pkc);
> > -
> > -out:
> > + hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
> > spin_unlock(&po->sk.sk_receive_queue.lock);
> > + return HRTIMER_RESTART;
> > }
> >
> > static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
> > @@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
> > pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
> >
> > prb_thaw_queue(pkc1);
> > - _prb_refresh_rx_retire_blk_timer(pkc1);
>
> Could you say more on why you remove this here and only reset/update
> the expiry time in the timer handler? Probably I missed something
> appearing in the previous long discussion.
I am gradually coming to understand the thinking behind this modification.
You're trying to move the timer operation out of prb_open_block() and then
spread it into each caller.
Did you perhaps miss the following call trace?
packet_current_rx_frame() -> __packet_lookup_frame_in_block() ->
prb_open_block() -> _prb_refresh_rx_retire_blk_timer()
May I ask why you bother introducing so many changes like this instead of
leaving it as-is?
Thanks,
Jason
>
> >
> > smp_wmb();
> > }
> > diff --git a/net/packet/diag.c b/net/packet/diag.c
> > index 6ce1dcc28..c8f43e0c1 100644
> > --- a/net/packet/diag.c
> > +++ b/net/packet/diag.c
> > @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
> > pdr.pdr_frame_nr = ring->frame_max + 1;
> >
> > if (ver > TPACKET_V2) {
> > - pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
> > + pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
> > pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
> > pdr.pdr_features = ring->prb_bdqc.feature_req_word;
> > } else {
> > diff --git a/net/packet/internal.h b/net/packet/internal.h
> > index d367b9f93..f8cfd9213 100644
> > --- a/net/packet/internal.h
> > +++ b/net/packet/internal.h
> > @@ -20,7 +20,6 @@ struct tpacket_kbdq_core {
> > unsigned int feature_req_word;
> > unsigned int hdrlen;
> > unsigned char reset_pending_on_curr_blk;
> > - unsigned char delete_blk_timer;
> > unsigned short kactive_blk_num;
> > unsigned short blk_sizeof_priv;
> >
> > @@ -39,12 +38,11 @@ struct tpacket_kbdq_core {
> > /* Default is set to 8ms */
> > #define DEFAULT_PRB_RETIRE_TOV (8)
> >
> > - unsigned short retire_blk_tov;
> > + ktime_t interval_ktime;
> > unsigned short version;
> > - unsigned long tov_in_jiffies;
> >
> > /* timer to retire an outstanding block */
> > - struct timer_list retire_blk_timer;
> > + struct hrtimer retire_blk_timer;
> > };
>
> Does the whole structure need to be reorganized?
>
> Before:
> /* size: 152, cachelines: 3, members: 22 */
> /* sum members: 144, holes: 2, sum holes: 8 */
> /* paddings: 1, sum paddings: 4 */
> /* last cacheline: 24 bytes */
> After:
> /* size: 176, cachelines: 3, members: 19 */
> /* sum members: 163, holes: 4, sum holes: 13 */
> /* paddings: 1, sum paddings: 4 */
> /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */
> /* last cacheline: 48 bytes */
>
> Thanks,
> Jason
>
> >
> > struct pgv {
> > --
> > 2.34.1
> >
> >
Powered by blists - more mailing lists