linux-kernel - RE: [PATCH RFC 01/16] prcu: Add PRCU implementation

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Tue, 30 Jan 2018 06:21:05 +0000
From:   "zhangheng (AC)" <heng.z@...wei.com>
To:     Lai Jiangshan <jiangshanlai+lkml@...il.com>,
        "lianglihao@...wei.com" <lianglihao@...wei.com>
CC:     "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
        "Guohanjun (Hanjun Guo)" <guohanjun@...wei.com>,
        "Chenhaibo (Haibo, OS Lab)" <hb.chen@...wei.com>,
        "lihao.liang@...il.com" <lihao.liang@...il.com>,
        LKML <linux-kernel@...r.kernel.org>
Subject: RE: [PATCH RFC 01/16] prcu: Add PRCU implementation

>-----Original Message-----
>From: jiangshanlai@...il.com [mailto:jiangshanlai@...il.com] On Behalf Of Lai Jiangshan
>Sent: 2018年1月29日 17:11
>To: lianglihao@...wei.com
>Cc: Paul E. McKenney <paulmck@...ux.vnet.ibm.com>; Guohanjun (Hanjun Guo) <guohanjun@...wei.com>; zhangheng (AC) <heng.z@...wei.com>; Chenhaibo (Haibo, OS Lab) <hb.chen@...wei.com>; lihao.liang@...il.com; LKML <linux-kernel@...r.kernel.org>
>Subject: Re: [PATCH RFC 01/16] prcu: Add PRCU implementation
>
>On Tue, Jan 23, 2018 at 3:59 PM,  <lianglihao@...wei.com> wrote:
>> From: Heng Zhang <heng.z@...wei.com>
>>
>> This RCU implementation (PRCU) is based on a fast consensus protocol 
>> published in the following paper:
>>
>> Fast Consensus Using Bounded Staleness for Scalable Read-mostly Synchronization.
>> Haibo Chen, Heng Zhang, Ran Liu, Binyu Zang, and Haibing Guan.
>> IEEE Transactions on Parallel and Distributed Systems (TPDS), 2016.
>> https://dl.acm.org/citation.cfm?id=3024114.3024143
>>
>> Signed-off-by: Heng Zhang <heng.z@...wei.com>
>> Signed-off-by: Lihao Liang <lianglihao@...wei.com>
>> ---
>>  include/linux/prcu.h |  37 +++++++++++++++
>>  kernel/rcu/Makefile  |   2 +-
>>  kernel/rcu/prcu.c    | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  kernel/sched/core.c  |   2 +
>>  4 files changed, 165 insertions(+), 1 deletion(-)  create mode 100644 
>> include/linux/prcu.h  create mode 100644 kernel/rcu/prcu.c
>>
>> diff --git a/include/linux/prcu.h b/include/linux/prcu.h new file mode 
>> 100644 index 00000000..653b4633
>> --- /dev/null
>> +++ b/include/linux/prcu.h
>> @@ -0,0 +1,37 @@
>> +#ifndef __LINUX_PRCU_H
>> +#define __LINUX_PRCU_H
>> +
>> +#include <linux/atomic.h>
>> +#include <linux/mutex.h>
>> +#include <linux/wait.h>
>> +
>> +#define CONFIG_PRCU
>> +
>> +struct prcu_local_struct {
>> +       unsigned int locked;
>> +       unsigned int online;
>> +       unsigned long long version;
>> +};
>> +
>> +struct prcu_struct {
>> +       atomic64_t global_version;
>> +       atomic_t active_ctr;
>> +       struct mutex mtx;
>> +       wait_queue_head_t wait_q;
>> +};
>> +
>> +#ifdef CONFIG_PRCU
>> +void prcu_read_lock(void);
>> +void prcu_read_unlock(void);
>> +void synchronize_prcu(void);
>> +void prcu_note_context_switch(void);
>> +
>> +#else /* #ifdef CONFIG_PRCU */
>> +
>> +#define prcu_read_lock() do {} while (0) #define prcu_read_unlock() 
>> +do {} while (0) #define synchronize_prcu() do {} while (0) #define 
>> +prcu_note_context_switch() do {} while (0)
>> +
>> +#endif /* #ifdef CONFIG_PRCU */
>> +#endif /* __LINUX_PRCU_H */
>> diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 
>> 23803c7d..8791419c 100644
>> --- a/kernel/rcu/Makefile
>> +++ b/kernel/rcu/Makefile
>> @@ -2,7 +2,7 @@
>>  # and is generally not a function of system call inputs.
>>  KCOV_INSTRUMENT := n
>>
>> -obj-y += update.o sync.o
>> +obj-y += update.o sync.o prcu.o
>>  obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
>>  obj-$(CONFIG_TREE_SRCU) += srcutree.o
>>  obj-$(CONFIG_TINY_SRCU) += srcutiny.o diff --git a/kernel/rcu/prcu.c 
>> b/kernel/rcu/prcu.c new file mode 100644 index 00000000..a00b9420
>> --- /dev/null
>> +++ b/kernel/rcu/prcu.c
>> @@ -0,0 +1,125 @@
>> +#include <linux/smp.h>
>> +#include <linux/prcu.h>
>> +#include <linux/percpu.h>
>> +#include <linux/compiler.h>
>> +#include <linux/sched.h>
>> +
>> +#include <asm/barrier.h>
>> +
>> +DEFINE_PER_CPU_SHARED_ALIGNED(struct prcu_local_struct, prcu_local);
>> +
>> +struct prcu_struct global_prcu = {
>> +       .global_version = ATOMIC64_INIT(0),
>> +       .active_ctr = ATOMIC_INIT(0),
>> +       .mtx = __MUTEX_INITIALIZER(global_prcu.mtx),
>> +       .wait_q = __WAIT_QUEUE_HEAD_INITIALIZER(global_prcu.wait_q)
>> +};
>> +struct prcu_struct *prcu = &global_prcu;
>> +
>> +static inline void prcu_report(struct prcu_local_struct *local) {
>> +       unsigned long long global_version;
>> +       unsigned long long local_version;
>> +
>> +       global_version = atomic64_read(&prcu->global_version);
>> +       local_version = local->version;
>> +       if (global_version > local_version)
>> +               cmpxchg(&local->version, local_version, 
>> + global_version);
>
>It is called with irq-disabled, and local->version can't be modified on other cpu. why cmpxchg is needed?

No, it will also be called by prcu_read_unlock in this implementation.

>> +}
>> +
>> +void prcu_read_lock(void)
>> +{
>> +       struct prcu_local_struct *local;
>> +
>> +       local = get_cpu_ptr(&prcu_local);
>> +       if (!local->online) {
>> +               WRITE_ONCE(local->online, 1);
>> +               smp_mb();
>
>What's is the paired code?

It is paired with the mutex_lock in synchronize_prcu.
It is used to ensure that if writer see the online is false, there must be no online reader on this core.

>
>> +       }
>> +
>> +       local->locked++;
>> +       put_cpu_ptr(&prcu_local);
>> +}
>> +EXPORT_SYMBOL(prcu_read_lock);
>> +
>> +void prcu_read_unlock(void)
>> +{
>> +       int locked;
>> +       struct prcu_local_struct *local;
>> +
>> +       barrier();
>> +       local = get_cpu_ptr(&prcu_local);
>> +       locked = local->locked;
>> +       if (locked) {
>> +               local->locked--;
>> +               if (locked == 1)
>> +                       prcu_report(local);
>> +               put_cpu_ptr(&prcu_local);
>> +       } else {
>> +               put_cpu_ptr(&prcu_local);
>> +               if (!atomic_dec_return(&prcu->active_ctr))
>> +                       wake_up(&prcu->wait_q);
>> +       }
>> +}
>> +EXPORT_SYMBOL(prcu_read_unlock);
>> +
>> +static void prcu_handler(void *info)
>> +{
>> +       struct prcu_local_struct *local;
>> +
>> +       local = this_cpu_ptr(&prcu_local);
>> +       if (!local->locked)
>> +               WRITE_ONCE(local->version, 
>> +atomic64_read(&prcu->global_version));
>> +}
>> +
>> +void synchronize_prcu(void)
>> +{
>> +       int cpu;
>> +       cpumask_t cpus;
>
>It might overflow the stack if the cpumask is large, please move it to struct prcu.
OK, thank you.

>> +       unsigned long long version;
>> +       struct prcu_local_struct *local;
>> +
>> +       version = atomic64_add_return(1, &prcu->global_version);
>
>I think this line of code at least causes the following problem.
>
>> +       mutex_lock(&prcu->mtx);
>> +
>> +       local = get_cpu_ptr(&prcu_local);
>> +       local->version = version;
>
>The successful orders of mutex_lock() might not be the same the orders of atomic64_add_return(). In this case,
>local->version will be decreased.

Yes, it should read the global_version again.
But I think it is also correct.

>
>prcu_report() can also happen here now. It is unsure who will change successfully the local->version.
>
>> +       put_cpu_ptr(&prcu_local);
>> +
>> +       cpumask_clear(&cpus);
>> +       for_each_possible_cpu(cpu) {
>> +               local = per_cpu_ptr(&prcu_local, cpu);
>> +               if (!READ_ONCE(local->online))
>> +                       continue;
>
>It seems like reading on local->online is unreliable.

Any problem?

>
>> +               if (READ_ONCE(local->version) < version) {
>
>please handle the cases when version wraps around the maximum.

That's why I just want to use 64bit counters.
Resetting counters needs a full system synchronization, which needs to involve a complex protocol.

>
>> +                       smp_call_function_single(cpu, prcu_handler, 
>> + NULL, 0);
>
>it smells bad when it is in for_each_possible_cpu() loop.
>
>> +                       cpumask_set_cpu(cpu, &cpus);
>> +               }
>> +       }
>> +
>> +       for_each_cpu(cpu, &cpus) {
>> +               local = per_cpu_ptr(&prcu_local, cpu);
>> +               while (READ_ONCE(local->version) < version)
>> +                       cpu_relax();
>> +       }
>>
>
>Ouch, the cpu_relax() loop would take a long time.
>Since it will wait until all the relevant cpus scheduled.
>relevant cpus: prcu reader active cpus.
>So this block of code equals to synchronze_sched() in many cases when prcu is massively used. isn't it?

No, it waits for all online readers to do unlocking. I think the read CS should not be long.
If prcu can be massively used (I don't believe...), I plan to split the singleton prcu into multiple allocable instances.
Because each instance only sents ipis to its own online readers, I think it could work better.

>
>
>smp_mb() /* A paired with B */
>
>> +       if (atomic_read(&prcu->active_ctr))
>> +               wait_event(prcu->wait_q, 
>> + !atomic_read(&prcu->active_ctr));
>> +
>> +       mutex_unlock(&prcu->mtx);
>> +}
>> +EXPORT_SYMBOL(synchronize_prcu);
>> +
>> +void prcu_note_context_switch(void)
>> +{
>> +       struct prcu_local_struct *local;
>> +
>> +       local = get_cpu_ptr(&prcu_local);
>> +       if (local->locked) {
>> +               atomic_add(local->locked, &prcu->active_ctr);
>
>smp_mb() /* B paired with A */

Thank you, I should consider more for non-TSO arch.

>
>> +               local->locked = 0;
>> +       }
>> +       local->online = 0;
>> +       prcu_report(local);
>> +       put_cpu_ptr(&prcu_local);
>> +}
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 
>> 326d4f88..a308581b 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -15,6 +15,7 @@
>>  #include <linux/init_task.h>
>>  #include <linux/context_tracking.h>
>>  #include <linux/rcupdate_wait.h>
>> +#include <linux/prcu.h>
>>
>>  #include <linux/blkdev.h>
>>  #include <linux/kprobes.h>
>> @@ -3383,6 +3384,7 @@ static void __sched notrace __schedule(bool 
>> preempt)
>>
>>         local_irq_disable();
>>         rcu_note_context_switch(preempt);
>> +       prcu_note_context_switch();
>>
>>         /*
>>          * Make sure that signal_pending_state()->signal_pending() 
>> below
>> --
>> 2.14.1.729.g59c0ea183
>>
>