Message-ID: <efiagdwmzfwpdzps74fvcwq3n4cs36q33ij7eebcpssactv3zu@se4hqiwxcfxq>
Date: Fri, 5 Dec 2025 10:51:51 -0800
From: Shakeel Butt <shakeel.butt@...ux.dev>
To: Tejun Heo <tj@...nel.org>
Cc: Johannes Weiner <hannes@...xchg.org>,
Michal Koutný <mkoutny@...e.com>, "Paul E . McKenney" <paulmck@...nel.org>,
JP Kobryn <inwardvessel@...il.com>, Yosry Ahmed <yosry.ahmed@...ux.dev>, linux-mm@...ck.org,
cgroups@...r.kernel.org, linux-kernel@...r.kernel.org,
Meta kernel team <kernel-team@...a.com>
Subject: Re: [PATCH] cgroup: rstat: use LOCK CMPXCHG in css_rstat_updated

On Fri, Dec 05, 2025 at 09:59:39AM -0800, Shakeel Butt wrote:
> On Fri, Dec 05, 2025 at 07:35:30AM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Thu, Dec 04, 2025 at 06:24:37PM -0800, Shakeel Butt wrote:
> > ...
> > > In Meta's fleet running the kernel with commit 36df6e3dbd7e, we are
> > > observing on some machines that the memcg stats are getting skewed
> > > by more than the actual memory on the system. On close inspection,
> > > we noticed that the lockless node of a workload for a specific CPU
> > > was in a bad state and thus all the updates on that CPU for that
> > > cgroup were being lost. At the moment, we are not sure whether this
> > > CMPXCHG without LOCK is the cause, but it needs to be fixed
> > > regardless.
> >
> > Is there a plausible theory of events that can explain the skew with the use
> > of this_cpu_cmpxchg()? lnode.next being set to self but this_cpu_cmpxchg()
> > returning something else? It may be useful to write a targeted repro for the
> > particular combination - this_cpu_cmpxchg() vs. remote NULL clearing and see
> > whether this_cpu_cmpxchg() can return a value that doesn't agree with what
> > gets written in the memory.
>
> Yes, I am working on creating a repro for this and will share the
> results.
>
I think I have found a repro. It is pasted below and can be built with
the following command:
$ gcc -O2 -pthread -o cmpxchg_race cmpxchg_race.c
$ cat cmpxchg_race.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <pthread.h>
#include <unistd.h>
#include <stdbool.h>
#include <sched.h>
#include <string.h>
// Kernel-style macros adapted for user-space
#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val) do { *(volatile typeof(x) *)&(x) = (val); } while (0)
// Simple llist implementation - EXACT KERNEL DEFINITIONS
struct llist_node {
struct llist_node *next;
};
struct llist_head {
struct llist_node *first;
};
// smp_load_acquire - load with acquire semantics
#define smp_load_acquire(p) \
({ \
typeof(*p) ___p1 = READ_ONCE(*p); \
__atomic_thread_fence(__ATOMIC_ACQUIRE); \
___p1; \
})
// try_cmpxchg - compare and exchange, updates old value on failure
static inline bool try_cmpxchg(struct llist_node **ptr, struct llist_node **old, struct llist_node *new)
{
typeof(*ptr) __old = *old;
typeof(*ptr) __ret;
asm volatile("lock cmpxchgq %2, %1"
: "=a" (__ret), "+m" (*ptr)
: "r" (new), "0" (__old)
: "memory");
if (__ret == __old)
return true;
*old = __ret;
return false;
}
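// Note: the llist primitives below use this LOCKed try_cmpxchg, matching
// the kernel implementation; only the updater's claim of the node (further
// down) is switchable between the locked and unlocked variants.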
/**
* init_llist_node - initialize lock-less list node
* From: include/linux/llist.h:84-87
*/
static inline void init_llist_node(struct llist_node *node)
{
node->next = node;
}
/**
* llist_on_list - test if a lock-less list node is on a list
* From: include/linux/llist.h:98-101
*/
static inline bool llist_on_list(const struct llist_node *node)
{
return node->next != node;
}
/**
* llist_add_batch - add several linked entries in batch
* From: include/linux/llist.h:234-245
*/
static inline bool llist_add_batch(struct llist_node *new_first,
struct llist_node *new_last,
struct llist_head *head)
{
struct llist_node *first = READ_ONCE(head->first);
do {
new_last->next = first;
} while (!try_cmpxchg(&head->first, &first, new_first));
return !first;
}
/**
* llist_add - add a new entry
* From: include/linux/llist.h:263-266
*/
static inline bool llist_add(struct llist_node *new, struct llist_head *head)
{
return llist_add_batch(new, new, head);
}
/**
* llist_del_first - delete the first entry of lock-less list
* From: lib/llist.c:31-43
*/
struct llist_node *llist_del_first(struct llist_head *head)
{
struct llist_node *entry, *next;
entry = smp_load_acquire(&head->first);
do {
if (entry == NULL)
return NULL;
next = READ_ONCE(entry->next);
} while (!try_cmpxchg(&head->first, &entry, next));
return entry;
}
/**
* llist_del_first_init - delete first entry and mark as off-list
* From: include/linux/llist.h:303-310
*/
static inline struct llist_node *llist_del_first_init(struct llist_head *head)
{
struct llist_node *n = llist_del_first(head);
if (n)
init_llist_node(n);
return n;
}
// Global list and node
struct llist_head global_list = { .first = NULL };
struct llist_node node_self;
volatile bool stop = false;
volatile uint64_t success_count = 0;
volatile uint64_t already_count = 0;
volatile uint64_t del_count = 0;
volatile uint64_t empty_count = 0;
bool use_locked_cmpxchg = false;
// CMPXCHG without LOCK
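// Hypothesis under test: without the LOCK prefix the compare-and-exchange
// is not guaranteed atomic with respect to stores from other CPUs, so a
// concurrent remote write (init_llist_node() storing next = self) may be
// lost while this CPU still observes a "successful" exchange.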
static inline bool cmpxchg_unlocked(struct llist_node **ptr, struct llist_node *old, struct llist_node *new_val)
{
struct llist_node *prev = old;
asm volatile(
"cmpxchgq %2, %1" // No lock prefix!
: "+a" (prev), "+m" (*ptr)
: "r" (new_val)
: "memory"
);
return prev == old;
}
// CMPXCHG with LOCK
static inline bool cmpxchg_locked(struct llist_node **ptr, struct llist_node *old, struct llist_node *new_val)
{
struct llist_node *prev = old;
asm volatile(
"lock cmpxchgq %2, %1" // WITH lock prefix!
: "+a" (prev), "+m" (*ptr)
: "r" (new_val)
: "memory"
);
return prev == old;
}
// Check if node is in the list
bool is_node_in_list(struct llist_head *head, struct llist_node *node)
{
struct llist_node *curr = READ_ONCE(head->first);
while (curr) {
if (curr == node)
return true;
curr = READ_ONCE(curr->next);
}
return false;
}
// Thread 1: Simulates css_rstat_updated()
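// Kernel-side analogue (per the discussion above): this_cpu_cmpxchg()
// flipping lnode->next from self to NULL, followed by llist_add() when the
// cmpxchg is "won".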
void *thread_cmpxchg(void *arg)
{
printf("Thread 1 (UPDATER): using %s CMPXCHG\n",
use_locked_cmpxchg ? "LOCKED" : "UNLOCKED");
while (!stop) {
// Try to atomically change from self to NULL (win the race)
bool success;
if (use_locked_cmpxchg) {
success = cmpxchg_locked(&node_self.next, &node_self, NULL);
} else {
success = cmpxchg_unlocked(&node_self.next, &node_self, NULL);
}
if (success) {
// We won! Add to the global list
llist_add(&node_self, &global_list);
success_count++;
} else {
already_count++;
}
}
return NULL;
}
// Thread 2: Simulates css_process_update_tree() -> llist_del_first_init()
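// Kernel-side analogue: the flusher popping the node off the update list
// and re-initializing it (next = self), potentially from a remote CPU,
// while the updater's cmpxchg may be in flight.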
void *thread_write(void *arg)
{
printf("Thread 2 (FLUSHER): doing llist_del_first_init\n");
while (!stop) {
// Remove first node from list and reinitialize it (sets next = self)
struct llist_node *node = llist_del_first_init(&global_list);
if (node) {
del_count++;
} else {
empty_count++;
}
}
return NULL;
}
void run_test(bool use_lock, int duration)
{
pthread_t t1, t2;
use_locked_cmpxchg = use_lock;
stop = false;
success_count = 0;
del_count = 0;
empty_count = 0;
already_count = 0;
// Initialize: node_self.next = self (not on list)
init_llist_node(&node_self);
global_list.first = NULL;
printf("\n=== Running test with %s CMPXCHG for %d seconds ===\n",
use_lock ? "LOCKED" : "UNLOCKED", duration);
pthread_create(&t1, NULL, thread_cmpxchg, NULL);
pthread_create(&t2, NULL, thread_write, NULL);
sleep(duration);
stop = true;
pthread_join(t1, NULL);
pthread_join(t2, NULL);
// Check final state
struct llist_node *next = READ_ONCE(node_self.next);
bool on_list = is_node_in_list(&global_list, &node_self);
printf("\n=== Results (%s CMPXCHG) ===\n", use_lock ? "LOCKED" : "UNLOCKED");
printf("Successful cmpxchg: %lu\n", success_count);
printf("Already on list: %lu\n", already_count);
printf("Deletions: %lu\n", del_count);
printf("Empty list: %lu\n", empty_count);
printf("\nFinal state:\n");
printf(" node_self.next: %s\n",
next == NULL ? "NULL" : (next == &node_self ? "self" : "OTHER"));
printf(" On global list: %s\n", on_list ? "YES" : "NO");
// Check for failure condition
bool failed = false;
if (next == NULL && !on_list) {
printf("\n*** FAILURE DETECTED! ***\n");
printf("node_self.next is NULL but node is NOT on the list!\n");
printf("This means we 'won' the cmpxchg race but lost the update.\n");
failed = true;
} else if (next == &node_self && !on_list) {
printf("\n✓ OK: node_self.next is self and not on list (expected state)\n");
} else if (next == NULL && on_list) {
printf("\n✓ OK: node_self.next is NULL and on list (expected state)\n");
} else if (on_list) {
printf("\n✓ OK: node is on list\n");
} else {
printf("\n✓ OK: consistent state\n");
}
if (failed) {
printf("\nThis demonstrates the race condition where:\n");
printf("1. Thread 1 does unlocked cmpxchg(node_self.next, self, NULL) → succeeds\n");
printf("2. Thread 2 does init_llist_node() → writes node_self.next = self\n");
printf("3. Thread 1 thinks it won but the write from Thread 2 was lost\n");
printf("4. Thread 1 adds node to list\n");
printf("5. Thread 2 removes node and does init_llist_node() again\n");
printf("6. Final state: next=NULL but not on list (INCONSISTENT!)\n");
}
}
int main(int argc, char *argv[])
{
int duration = argc > 1 ? atoi(argv[1]) : 3;
printf("=== Simulating css_rstat_updated() Race Condition ===\n");
printf("Using EXACT kernel llist implementations from:\n");
printf(" - include/linux/llist.h (init_llist_node, llist_on_list, llist_add)\n");
printf(" - lib/llist.c (llist_del_first)\n");
printf("\n");
printf("This emulates the exact kernel scenario:\n");
printf(" Thread 1: css_rstat_updated() - cmpxchg + llist_add\n");
printf(" Thread 2: css_process_update_tree() - llist_del_first_init\n");
printf("\n");
// Run with unlocked CMPXCHG (the bug)
run_test(false, duration);
printf("\n");
printf("========================================\n");
// Run with locked CMPXCHG (the fix)
run_test(true, duration);
return 0;
}