linux-kernel - Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat update

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5560c690cc6de67139a9b2e45c7a11938b70fc58.camel@mediatek.com>
Date: Wed, 17 Jul 2024 03:44:47 +0000
From: Boy Wu (吳勃誼) <Boy.Wu@...iatek.com>
To: "tj@...nel.org" <tj@...nel.org>
CC: "boris@....io" <boris@....io>, "linux-block@...r.kernel.org"
	<linux-block@...r.kernel.org>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>, "cgroups@...r.kernel.org"
	<cgroups@...r.kernel.org>, "linux-mediatek@...ts.infradead.org"
	<linux-mediatek@...ts.infradead.org>, "axboe@...nel.dk" <axboe@...nel.dk>,
	Iverlin Wang (王苳霖) <Iverlin.Wang@...iatek.com>,
	"josef@...icpanda.com" <josef@...icpanda.com>,
	"linux-arm-kernel@...ts.infradead.org"
	<linux-arm-kernel@...ts.infradead.org>, "matthias.bgg@...il.com"
	<matthias.bgg@...il.com>, "angelogioacchino.delregno@...labora.com"
	<angelogioacchino.delregno@...labora.com>
Subject: Re: [PATCH v3] blk-cgroup: Replace u64 sync with spinlock for iostat
 update

On Wed, 2024-07-17 at 10:25 +0800, Boy.Wu wrote:
> On Tue, 2024-07-16 at 11:13 -1000, Tejun Heo wrote:
> >  	 
> > External email : Please do not click links or open attachments
> > until
> > you have verified the sender or the content.
> >  Hello, Boy.
> > 
> > So, looking at the patch, I'm not sure per-blkg lock makes sense.
> > 
> > On Tue, Jul 16, 2024 at 03:52:06PM +0800, boy.wu wrote:
> > > @@ -995,15 +995,13 @@ static void blkcg_iostat_update(struct
> > 
> > blkcg_gq *blkg, struct blkg_iostat *cur,
> > >  struct blkg_iostat *last)
> > >  {
> > >  struct blkg_iostat delta;
> > > -unsigned long flags;
> > >  
> > >  /* propagate percpu delta to global */
> > > -flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> > > +guard(spinlock_irqsave)(&blkg->iostat.spinlock);
> > >  blkg_iostat_set(&delta, cur);
> > >  blkg_iostat_sub(&delta, last);
> > >  blkg_iostat_add(&blkg->iostat.cur, &delta);
> > >  blkg_iostat_add(last, &delta);
> > > -u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> > >  }
> > 
> > This is already called with blkg_stat_lock held.
> 
> In blkcg_iostat_update, it is protected by both blkg_stat_lock and
> u64
> sync. I agree that no changes are needed here.
> 
> > 
> > > @@ -1051,10 +1048,8 @@ static void __blkcg_rstat_flush(struct
> > > blkcg
> > 
> > *blkcg, int cpu)
> > >  goto propagate_up; /* propagate up to parent only */
> > >  
> > >  /* fetch the current per-cpu values */
> > > -do {
> > > -seq = u64_stats_fetch_begin(&bisc->sync);
> > > +scoped_guard(spinlock_irqsave, &bisc->spinlock)
> > >  blkg_iostat_set(&cur, &bisc->cur);
> > > -} while (u64_stats_fetch_retry(&bisc->sync, seq));
> > 
> > This is per-cpu stat and we should keep using u64_sync for them.
> > 
> > > @@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
> > >  cpu_dkstats->sectors[STAT_DISCARD] << 9;
> > >  }
> > >  
> > > -flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
> > > +guard(spinlock_irqsave)(&blkg->iostat.spinlock);
> > >  blkg_iostat_set(&blkg->iostat.cur, &tmp);
> > > -u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
> > >  }
> > >  }
> > 
> > ...
> > > @@ -1157,16 +1149,14 @@ static void blkcg_print_one_stat(struct
> > 
> > blkcg_gq *blkg, struct seq_file *s)
> > >  
> > >  seq_printf(s, "%s ", dname);
> > >  
> > > -do {
> > > -seq = u64_stats_fetch_begin(&bis->sync);
> > > -
> > > +scoped_guard(spinlock_irqsave, &bis->spinlock) {
> > >  rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
> > >  wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
> > >  dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
> > >  rios = bis->cur.ios[BLKG_IOSTAT_READ];
> > >  wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
> > >  dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
> > > -} while (u64_stats_fetch_retry(&bis->sync, seq));
> > > +}
> > 
> > The above two are the only places which can potentially benefit
> > from
> > per-blkg lock but these aren't hot paths. I'd just use
> > blkg_stat_lock
> > for
> > the above.
> 
> In __blkcg_rstat_flush, u64 sync is used for fetching data, changing
> to
> spinlock will increase overhead both 64 bit and 32 bit systems. 64
> bit
> systems do noting in u64 sync, and 32 bit systems can read data in
> parallel in u64 sync if no one updating data. However, it is already
> protected by blkg_stat_lock, so there is no parallelism now, and
> there
> is no issue here. I think that no changes are needed here.
> 
> In blkcg_fill_root_iostats, this is where the issue happens in 32 bit
> SMP systems, so spinlock needs to be added to protect it.
> 
> In blkcg_print_one_stat, u64 sync is used for fetching data. Changing
> to spinlock will increase overhead like in __blkcg_rstat_flush.
> However, it is already protected by spin_lock_irq(&blkg->q-
> >queue_lock) 
> in blkcg_print_stat, so there is no parallelism now, and there is no
> issue here. I think that no changes are needed here.
> 
> > 
> > > @@ -2152,30 +2141,29 @@ void blk_cgroup_bio_start(struct bio
> > > *bio)
> > >  
> > >  cpu = get_cpu();
> > >  bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
> > > -flags = u64_stats_update_begin_irqsave(&bis->sync);
> > > -
> > > -/*
> > > - * If the bio is flagged with BIO_CGROUP_ACCT it means this is a
> > 
> > split
> > > - * bio and we would have already accounted for the size of the
> > 
> > bio.
> > > - */
> > > -if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> > > -bio_set_flag(bio, BIO_CGROUP_ACCT);
> > > -bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> > > -}
> > > -bis->cur.ios[rwd]++;
> > > +scoped_guard(spinlock_irqsave, &bis->spinlock) {
> > > +/*
> > > + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a
> > 
> > split
> > > + * bio and we would have already accounted for the size of the
> > 
> > bio.
> > > + */
> > > +if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
> > > +bio_set_flag(bio, BIO_CGROUP_ACCT);
> > > +bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
> > > +}
> > > +bis->cur.ios[rwd]++;
> > >  
> > > -/*
> > > - * If the iostat_cpu isn't in a lockless list, put it into the
> > > - * list to indicate that a stat update is pending.
> > > - */
> > > -if (!READ_ONCE(bis->lqueued)) {
> > > -struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
> > > +/*
> > > + * If the iostat_cpu isn't in a lockless list, put it into the
> > > + * list to indicate that a stat update is pending.
> > > + */
> > > +if (!READ_ONCE(bis->lqueued)) {
> > > +struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
> > >  
> > > -llist_add(&bis->lnode, lhead);
> > > -WRITE_ONCE(bis->lqueued, true);
> > > +llist_add(&bis->lnode, lhead);
> > > +WRITE_ONCE(bis->lqueued, true);
> > > +}
> > 
> > These are per-cpu stat updates which should keep using u64_sync. We
> > don't
> > want to incur locking overhead for stat updates in the hot issue
> > path.
> > 
> 
> I agree that no changes are needed in blk_cgroup_bio_start.
> 
> > Thanks.
> > 
> > -- 
> > tejun
> 
> I think we can look back to the original issue, which is that on 32
> bit
> SMP systems will have concurrent problems on
> u64_stats_update_begin_irqsave and u64_stats_update_end_irqrestore in
> blkcg_fill_root_iostats. So, adding a lock only on 32 bits systems
> in blkcg_fill_root_iostats is to resolve the concurrent issue and not
> affect 64 bit systems, which do not have the issue in the first
> place,
> because u64 sync does noting in 64 bit systems and they don't need
> it.
> 
> I think we can just fix it by following change.
> 
> @@ -1134,9 +1128,8 @@ static void blkcg_fill_root_iostats(void)
>                                 cpu_dkstats->sectors[STAT_DISCARD] <<
> 9;
>                 }
> 
> +#if BITS_PER_LONG == 32
> +               guard(spinlock_irqsave)(&blkg_stat_lock);
> +#endif
>                 flags = u64_stats_update_begin_irqsave(&blkg-
> > iostat.sync);
> 
>                 blkg_iostat_set(&blkg->iostat.cur, &tmp);
>                 u64_stats_update_end_irqrestore(&blkg->iostat.sync,
> flags);
>         }
>  }

I think there is another way to fix it. Maybe we should address the
root cause, which is that u64_stats_update_begin_irqsave
and u64_stats_update_end_irqrestore do not protect the seqcount in 32
bit SMP systems. This can be fix by the following change.

diff --git a/include/linux/u64_stats_sync.h
b/include/linux/u64_stats_sync.h
index 46040d66334a..94dd74b4fb2c 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -64,6 +64,7 @@
 struct u64_stats_sync {
 #if BITS_PER_LONG == 32
        seqcount_t      seq;
+       spinlock_t      spinlock;
 #endif
 };

@@ -138,6 +139,7 @@ static inline void u64_stats_inc(u64_stats_t *p)
 static inline void u64_stats_init(struct u64_stats_sync *syncp)
 {
        seqcount_init(&syncp->seq);
+       spin_lock_init(&syncp->spinlock)
 }

 static inline void __u64_stats_update_begin(struct u64_stats_sync
*syncp)
@@ -191,6 +193,7 @@ static inline unsigned long
u64_stats_update_begin_irqsave(struct u64_stats_sync
 {
        unsigned long flags = __u64_stats_irqsave();

+       spin_lock_irq(&syncp->spinlock);
        __u64_stats_update_begin(syncp);
        return flags;
 }
@@ -199,6 +202,7 @@ static inline void
u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
                                                   unsigned long flags)
 {
        __u64_stats_update_end(syncp);
+       spin_unlock_irq(&syncp->spinlock);
        __u64_stats_irqrestore(flags);
 }

--
boy.wu