[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <7fbb84a57fc8046738c7196031a3fd97ea8334e2.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:57 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
x86@...nel.org,
luto@...nel.org,
kees@...nel.org,
akpm@...ux-foundation.org,
david@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
peterz@...radead.org
Cc: dietmar.eggemann@....com,
hpa@...or.com,
acme@...nel.org,
namhyung@...nel.org,
mark.rutland@....com,
alexander.shishkin@...ux.intel.com,
jolsa@...nel.org,
irogers@...gle.com,
adrian.hunter@...el.com,
kan.liang@...ux.intel.com,
viro@...iv.linux.org.uk,
brauner@...nel.org,
jack@...e.cz,
lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com,
vbabka@...e.cz,
rppt@...nel.org,
surenb@...gle.com,
mhocko@...e.com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
jannh@...gle.com,
pfalcato@...e.de,
riel@...riel.com,
harry.yoo@...cle.com,
linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org,
linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org,
duanxiongchun@...edance.com,
yinhongbo@...edance.com,
dengliang.1214@...edance.com,
xieyongji@...edance.com,
chaiwen.cc@...edance.com,
songmuchun@...edance.com,
yuanzhu@...edance.com,
chengguozhu@...edance.com,
sunjiadong.lff@...edance.com,
Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 29/35] RPAL: fix race condition in pkru update
When setting up MPK, RPAL uses IPIs to notify the tasks of the thread group
running on each core to modify their PKRU values, and it updates the PKEY
fields in the page tables of all VMAs. A race condition exists here: when
PKRU is updated, the page table updates may not yet be complete. During
that window, code paths that reset PKRU by calling pkru_write_default()
(e.g., signal handling) must not restrict PKRU to a single PKEY, because
such a PKRU value cannot accommodate both the old and the new page table
PKEY settings.
This patch introduces a pku_on state with values PKU_ON_FALSE, PKU_ON_INIT,
and PKU_ON_FINISH, representing the states before, during, and after page
table PKEY updates, respectively. For RPAL services, all calls to
pkru_write_default() are replaced with rpal_pkru_write_default().
- Before page table setup (PKU_ON_FALSE), rpal_pkru_write_default()
directly calls pkru_write_default().
- During page table setup (PKU_ON_INIT), rpal_pkru_write_default() enables
permissions for all PKEYs, ensuring the task can access both old and new
page tables simultaneously.
- After page table setup completes (PKU_ON_FINISH),
rpal_pkru_write_default() tightens permissions to match the updated page
tables.
For newly allocated page tables (mappings created via do_mmap() and
do_brk_flags()), the new PKEY is used only when pku_on is PKU_ON_FINISH.
The mmap lock is used to ensure that no race conditions occur during this
process.
Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
arch/x86/kernel/cpu/common.c | 4 ++--
arch/x86/kernel/fpu/core.c | 4 ++--
arch/x86/kernel/process.c | 4 ++--
arch/x86/rpal/pku.c | 14 +++++++++++++-
arch/x86/rpal/service.c | 2 +-
include/linux/rpal.h | 9 ++++++++-
mm/mmap.c | 2 +-
mm/mprotect.c | 1 +
mm/vma.c | 2 +-
9 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2678453cdf76..d21f44873b86 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -534,8 +534,8 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
cr4_set_bits(X86_CR4_PKE);
/* Load the default PKRU value */
#ifdef CONFIG_RPAL_PKU
- if (rpal_current_service() && rpal_current_service()->pku_on)
- write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ if (rpal_current_service())
+ rpal_pkru_write_default();
else
#endif
pkru_write_default();
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 251b1ddee726..4b413af0b179 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -748,8 +748,8 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
frstor(&init_fpstate.regs.fsave);
#ifdef CONFIG_RPAL_PKU
- if (rpal_current_service() && rpal_current_service()->pku_on)
- write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ if (rpal_current_service())
+ rpal_pkru_write_default();
else
#endif
pkru_write_default();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b74de35218f9..898a9e0b23e7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -286,8 +286,8 @@ static void pkru_flush_thread(void)
* the hardware right here (similar to context switch).
*/
#ifdef CONFIG_RPAL_PKU
- if (rpal_current_service() && rpal_current_service()->pku_on)
- write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ if (rpal_current_service())
+ rpal_pkru_write_default();
else
#endif
pkru_write_default();
diff --git a/arch/x86/rpal/pku.c b/arch/x86/rpal/pku.c
index 26cef324f41f..8e530931fb23 100644
--- a/arch/x86/rpal/pku.c
+++ b/arch/x86/rpal/pku.c
@@ -161,7 +161,7 @@ int rpal_pkey_setup(struct rpal_service *rs, int pkey)
rs->pkey = pkey;
/* others must see rs->pkey before rs->pku_on */
barrier();
- rs->pku_on = true;
+ rs->pku_on = PKU_ON_INIT;
mmap_write_unlock(current->mm);
rpal_set_group_pkru(val, RPAL_PKRU_UNION);
err = do_rpal_mprotect_pkey(rs->base, RPAL_ADDR_SPACE_SIZE, pkey);
@@ -182,3 +182,15 @@ int rpal_alloc_pkey(struct rpal_service *rs, int pkey)
return ret;
}
+
+void rpal_pkru_write_default(void)
+{
+ struct rpal_service *cur = rpal_current_service();
+
+ if (cur->pku_on == PKU_ON_INIT)
+ write_pkru(0);
+ else if (cur->pku_on == PKU_ON_FINISH)
+ write_pkru(rpal_pkey_to_pkru(rpal_current_service()->pkey));
+ else
+ pkru_write_default();
+}
diff --git a/arch/x86/rpal/service.c b/arch/x86/rpal/service.c
index 7a83e85cf096..9fd568fa9a29 100644
--- a/arch/x86/rpal/service.c
+++ b/arch/x86/rpal/service.c
@@ -210,7 +210,7 @@ struct rpal_service *rpal_register_service(void)
init_waitqueue_head(&rs->rpd.rpal_waitqueue);
#ifdef CONFIG_RPAL_PKU
rs->pkey = -1;
- rs->pku_on = false;
+ rs->pku_on = PKU_ON_FALSE;
rpal_service_pku_init();
#endif
diff --git a/include/linux/rpal.h b/include/linux/rpal.h
index 7657e6c6393b..16a3c80383f7 100644
--- a/include/linux/rpal.h
+++ b/include/linux/rpal.h
@@ -138,6 +138,12 @@ enum rpal_capability {
RPAL_CAP_PKU
};
+enum {
+ PKU_ON_FALSE,
+ PKU_ON_INIT,
+ PKU_ON_FINISH,
+};
+
struct rpal_critical_section {
unsigned long ret_begin;
unsigned long ret_end;
@@ -245,7 +251,7 @@ struct rpal_service {
#ifdef CONFIG_RPAL_PKU
/* pkey */
- bool pku_on;
+ int pku_on;
int pkey;
#endif
@@ -599,6 +605,7 @@ __rpal_switch_to(struct task_struct *prev_p, struct task_struct *next_p);
asmlinkage __visible void rpal_schedule_tail(struct task_struct *prev);
int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey);
void rpal_set_pku_schedule_tail(struct task_struct *prev);
+void rpal_pkru_write_default(void);
int rpal_ep_autoremove_wake_function(wait_queue_entry_t *curr,
unsigned int mode, int wake_flags,
void *key);
diff --git a/mm/mmap.c b/mm/mmap.c
index d36ea4ea2bd0..85a4a33491ab 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -404,7 +404,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
do {
struct rpal_service *cur = rpal_current_service();
- if (cur && cur->pku_on)
+ if (cur && cur->pku_on == PKU_ON_FINISH)
pkey = cur->pkey;
} while (0);
#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e9ae828e377d..ac162180553e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -938,6 +938,7 @@ int do_rpal_mprotect_pkey(unsigned long start, size_t len, int pkey)
}
tlb_finish_mmu(&tlb);
+ rpal_current_service()->pku_on = PKU_ON_FINISH;
out:
mmap_write_unlock(current->mm);
return error;
diff --git a/mm/vma.c b/mm/vma.c
index fa9d8f694e6e..57ec99a5969d 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2632,7 +2632,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct rpal_service *cur = rpal_current_service();
unsigned long vma_pkey_mask;
- if (cur && cur->pku_on) {
+ if (cur && cur->pku_on == PKU_ON_FINISH) {
vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 |
VM_PKEY_BIT3;
flags &= ~vma_pkey_mask;
--
2.20.1
Powered by blists - more mailing lists