Message-ID: <YhqtEWToJVZV2npr@linutronix.de>
Date: Sat, 26 Feb 2022 23:43:29 +0100
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: Thomas Gleixner <tglx@...utronix.de>
Cc: LKML <linux-kernel@...r.kernel.org>,
linux-rt-users@...r.kernel.org,
Steven Rostedt <rostedt@...dmis.org>
Subject: [ANNOUNCE] v5.17-rc5-rt9
Dear RT folks!
I'm pleased to announce the v5.17-rc5-rt9 patch set.
Changes since v5.17-rc5-rt8:
- Update the memcg patches to v5.
- Update the networking patches to v3 and include the follow-up
patches.
- Update the patches for the random subsystem to the latest version as
committed to the subsystem's git tree.
- Update the delayed task-struct free patches to v2.
- Update the generic_handle_irq_safe() patches to v4.
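As a side note on the last item: per the updated kerneldoc (see the kernel/irq/irqdesc.c hunk in the delta below), generic_handle_irq_safe() may be invoked from any context, so demultiplexing drivers no longer need to open-code local_irq_save() around generic_handle_irq(). A minimal sketch of that usage pattern follows; it is not part of the patch set, and the chip structure and pending-bit helper are made-up names:

  /* Hypothetical demux driver; assumes <linux/interrupt.h>, <linux/irqdomain.h>, <linux/bitops.h>. */
  static irqreturn_t demux_irq_thread(int irq, void *data)
  {
  	struct demux_chip *chip = data;				/* made-up driver state */
  	unsigned long pending = demux_read_pending(chip);	/* made-up helper */
  	int bit;

  	for_each_set_bit(bit, &pending, chip->nr_child_irqs) {
  		/* Safe from this threaded (process) context. */
  		generic_handle_irq_safe(irq_find_mapping(chip->domain, bit));
  	}

  	return IRQ_HANDLED;
  }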
Known issues
- Valentin Schneider reported a few splats on ARM64, see
https://lkml.kernel.org/r/20210810134127.1394269-1-valentin.schneider@arm.com
The delta patch against v5.17-rc5-rt8 is appended below and can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.17/incr/patch-5.17-rc5-rt8-rt9.patch.xz
You can get this release via the git tree at:
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.17-rc5-rt9
The RT patch against v5.17-rc5 can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.17/older/patch-5.17-rc5-rt9.patch.xz
The split quilt queue is available at:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.17/older/patches-5.17-rc5-rt9.tar.xz
Sebastian
diff --git a/drivers/char/random.c b/drivers/char/random.c
index e4bde9c917654..82ee3ed8099dd 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -401,13 +401,12 @@ static void mix_pool_bytes(const void *in, int nbytes)
}
struct fast_pool {
+ u32 pool[4];
struct work_struct mix;
unsigned long last;
- u32 pool[4];
unsigned int count;
u16 reg_idx;
};
-#define FAST_POOL_MIX_INFLIGHT (1U << 31)
/*
* This is a fast mixing routine used by the interrupt randomness
@@ -719,12 +718,13 @@ static size_t crng_fast_load(const u8 *cp, size_t len)
p[crng_init_cnt % CHACHA_KEY_SIZE] ^= *cp;
cp++; crng_init_cnt++; len--; ret++;
}
- spin_unlock_irqrestore(&primary_crng.lock, flags);
if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) {
invalidate_batched_entropy();
crng_init = 1;
- pr_notice("fast init done\n");
}
+ spin_unlock_irqrestore(&primary_crng.lock, flags);
+ if (crng_init == 1)
+ pr_notice("fast init done\n");
return ret;
}
@@ -1006,6 +1006,29 @@ EXPORT_SYMBOL_GPL(add_input_randomness);
static DEFINE_PER_CPU(struct fast_pool, irq_randomness);
+#ifdef CONFIG_SMP
+/*
+ * This function is called when the CPU has just come online, with
+ * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE.
+ */
+int random_online_cpu(unsigned int cpu)
+{
+ /*
+ * During CPU shutdown and before CPU onlining, add_interrupt_
+ * randomness() may schedule mix_interrupt_randomness(), and
+ * set the MIX_INFLIGHT flag. However, because the worker can
+ * be scheduled on a different CPU during this period, that
+ * flag will never be cleared. For that reason, we zero out
+ * the flag here, which runs just after workqueues are onlined
+ * for the CPU again. This also has the effect of setting the
+ * irq randomness count to zero so that new accumulated irqs
+ * are fresh.
+ */
+ per_cpu_ptr(&irq_randomness, cpu)->count = 0;
+ return 0;
+}
+#endif
+
#ifdef ADD_INTERRUPT_BENCH
static unsigned long avg_cycles, avg_deviation;
@@ -1045,29 +1068,23 @@ static u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
static void mix_interrupt_randomness(struct work_struct *work)
{
struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
- u8 pool[sizeof(fast_pool->pool)];
+ u32 pool[4];
- if (unlikely(crng_init == 0)) {
- size_t ret;
-
- ret = crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool));
- if (ret) {
- WRITE_ONCE(fast_pool->count, 0);
- fast_pool->last = jiffies;
- return;
- }
+ /* Check to see if we're running on the wrong CPU due to hotplug. */
+ local_irq_disable();
+ if (fast_pool != this_cpu_ptr(&irq_randomness)) {
+ local_irq_enable();
+ return;
}
/*
- * Since this is the result of a trip through the scheduler, xor in
- * a cycle counter. It can't hurt, and might help.
+ * Copy the pool to the stack so that the mixer always has a
+ * consistent view, before we reenable irqs again.
*/
- fast_pool->pool[3] ^= random_get_entropy();
- /* Copy the pool to the stack so that the mixer always has a consistent view. */
memcpy(pool, fast_pool->pool, sizeof(pool));
- /* We take care to zero out the count only after we're done reading the pool. */
- WRITE_ONCE(fast_pool->count, 0);
+ fast_pool->count = 0;
fast_pool->last = jiffies;
+ local_irq_enable();
mix_pool_bytes(pool, sizeof(pool));
credit_entropy_bits(1);
@@ -1076,13 +1093,14 @@ static void mix_interrupt_randomness(struct work_struct *work)
void add_interrupt_randomness(int irq)
{
+ enum { MIX_INFLIGHT = 1U << 31 };
struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
struct pt_regs *regs = get_irq_regs();
unsigned long now = jiffies;
cycles_t cycles = random_get_entropy();
- unsigned int new_count;
u32 c_high, j_high;
u64 ip;
+ unsigned int new_count;
if (cycles == 0)
cycles = get_reg(fast_pool, regs);
@@ -1096,33 +1114,28 @@ void add_interrupt_randomness(int irq)
(sizeof(ip) > 4) ? ip >> 32 : get_reg(fast_pool, regs);
fast_mix(fast_pool);
- add_interrupt_bench(cycles);
new_count = ++fast_pool->count;
+ add_interrupt_bench(cycles);
+
if (unlikely(crng_init == 0)) {
- if (new_count & FAST_POOL_MIX_INFLIGHT)
- return;
-
- if (new_count < 64)
- return;
-
- fast_pool->count |= FAST_POOL_MIX_INFLIGHT;
- if (unlikely(!fast_pool->mix.func))
- INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
- queue_work_on(raw_smp_processor_id(), system_highpri_wq,
- &fast_pool->mix);
+ if ((new_count >= 64) &&
+ crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {
+ fast_pool->count = 0;
+ fast_pool->last = now;
+ }
return;
}
- if (new_count & FAST_POOL_MIX_INFLIGHT)
+ if (new_count & MIX_INFLIGHT)
return;
- if (new_count < 64 && !time_after(now, fast_pool->last + HZ))
+ if ((new_count < 64) && !time_after(now, fast_pool->last + HZ))
return;
if (unlikely(!fast_pool->mix.func))
INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
- fast_pool->count |= FAST_POOL_MIX_INFLIGHT;
+ fast_pool->count |= MIX_INFLIGHT;
queue_work_on(raw_smp_processor_id(), system_highpri_wq, &fast_pool->mix);
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);
@@ -1945,7 +1958,7 @@ u64 get_random_u64(void)
batch = raw_cpu_ptr(&batched_entropy_u64);
next_gen = atomic_read(&batch_generation);
- if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0 ||
+ if (batch->position >= ARRAY_SIZE(batch->entropy_u64) ||
next_gen != batch->generation) {
extract_crng((u8 *)batch->entropy_u64);
batch->position = 0;
@@ -1976,7 +1989,7 @@ u32 get_random_u32(void)
batch = raw_cpu_ptr(&batched_entropy_u32);
next_gen = atomic_read(&batch_generation);
- if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0 ||
+ if (batch->position >= ARRAY_SIZE(batch->entropy_u32) ||
next_gen != batch->generation) {
extract_crng((u8 *)batch->entropy_u32);
batch->position = 0;
@@ -1989,6 +2002,24 @@ u32 get_random_u32(void)
}
EXPORT_SYMBOL(get_random_u32);
+#ifdef CONFIG_SMP
+/*
+ * This function is called when the CPU is coming up, with entry
+ * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP.
+ */
+int random_prepare_cpu(unsigned int cpu)
+{
+ /*
+ * When the cpu comes back online, immediately invalidate both
+ * the per-cpu crng and all batches, so that we serve fresh
+ * randomness.
+ */
+ per_cpu_ptr(&batched_entropy_u32, cpu)->position = UINT_MAX;
+ per_cpu_ptr(&batched_entropy_u64, cpu)->position = UINT_MAX;
+ return 0;
+}
+#endif
+
/* It's important to invalidate all potential batched entropy that might
* be stored before the crng is initialized, which we can do lazily by
* bumping the generation counter.
diff --git a/drivers/net/amt.c b/drivers/net/amt.c
index f1a36d7e2151c..10455c9b9da0e 100644
--- a/drivers/net/amt.c
+++ b/drivers/net/amt.c
@@ -2373,7 +2373,7 @@ static bool amt_membership_query_handler(struct amt_dev *amt,
skb->pkt_type = PACKET_MULTICAST;
skb->ip_summed = CHECKSUM_NONE;
len = skb->len;
- if (netif_rx(skb) == NET_RX_SUCCESS) {
+ if (__netif_rx(skb) == NET_RX_SUCCESS) {
amt_update_gw_status(amt, AMT_STATUS_RECEIVED_QUERY, true);
dev_sw_netstats_rx_add(amt->dev, len);
} else {
@@ -2470,7 +2470,7 @@ static bool amt_update_handler(struct amt_dev *amt, struct sk_buff *skb)
skb->pkt_type = PACKET_MULTICAST;
skb->ip_summed = CHECKSUM_NONE;
len = skb->len;
- if (netif_rx(skb) == NET_RX_SUCCESS) {
+ if (__netif_rx(skb) == NET_RX_SUCCESS) {
amt_update_relay_status(tunnel, AMT_STATUS_RECEIVED_UPDATE,
true);
dev_sw_netstats_rx_add(amt->dev, len);
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index c1fdd721a730d..a895ff756093a 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -925,7 +925,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
}
skb->protocol = eth_type_trans(skb, geneve->dev);
- netif_rx(skb);
+ __netif_rx(skb);
dst_release(&rt->dst);
return -EMSGSIZE;
}
@@ -1021,7 +1021,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
}
skb->protocol = eth_type_trans(skb, geneve->dev);
- netif_rx(skb);
+ __netif_rx(skb);
dst_release(dst);
return -EMSGSIZE;
}
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 24e5c54d06c15..bf087171bcf04 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -207,7 +207,7 @@ static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb,
dev_sw_netstats_rx_add(pctx->dev, skb->len);
- netif_rx(skb);
+ __netif_rx(skb);
return 0;
err:
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 77f5b564382b6..d05f86fe78c95 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -78,7 +78,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
skb_orphan(skb);
- /* Before queueing this packet to netif_rx(),
+ /* Before queueing this packet to __netif_rx(),
* make sure dst is refcounted.
*/
skb_dst_force(skb);
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 3d08743317634..832f09ac075e7 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -1033,7 +1033,7 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb)
else
nskb->pkt_type = PACKET_MULTICAST;
- netif_rx(nskb);
+ __netif_rx(nskb);
}
continue;
}
@@ -1056,7 +1056,7 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb)
nskb->dev = ndev;
- if (netif_rx(nskb) == NET_RX_SUCCESS) {
+ if (__netif_rx(nskb) == NET_RX_SUCCESS) {
u64_stats_update_begin(&secy_stats->syncp);
secy_stats->stats.InPktsUntagged++;
u64_stats_update_end(&secy_stats->syncp);
@@ -1288,7 +1288,7 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb)
macsec_reset_skb(nskb, macsec->secy.netdev);
- ret = netif_rx(nskb);
+ ret = __netif_rx(nskb);
if (ret == NET_RX_SUCCESS) {
u64_stats_update_begin(&secy_stats->syncp);
secy_stats->stats.InPktsUnknownSCI++;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 6ef5f77be4d0a..d87c06c317ede 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -410,7 +410,7 @@ static void macvlan_forward_source_one(struct sk_buff *skb,
if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr))
nskb->pkt_type = PACKET_HOST;
- ret = netif_rx(nskb);
+ ret = __netif_rx(nskb);
macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
}
@@ -468,7 +468,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
/* forward to original port. */
vlan = src;
ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
- netif_rx(skb);
+ __netif_rx(skb);
handle_res = RX_HANDLER_CONSUMED;
goto out;
}
diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c
index aaa628f859fd4..0b1b6f650104b 100644
--- a/drivers/net/mhi_net.c
+++ b/drivers/net/mhi_net.c
@@ -225,7 +225,7 @@ static void mhi_net_dl_callback(struct mhi_device *mhi_dev,
u64_stats_inc(&mhi_netdev->stats.rx_packets);
u64_stats_add(&mhi_netdev->stats.rx_bytes, skb->len);
u64_stats_update_end(&mhi_netdev->stats.rx_syncp);
- netif_rx(skb);
+ __netif_rx(skb);
}
/* Refill if RX buffers queue becomes low */
diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c
index 98ca6b18415e7..80bdc07f2cd33 100644
--- a/drivers/net/ntb_netdev.c
+++ b/drivers/net/ntb_netdev.c
@@ -119,7 +119,7 @@ static void ntb_netdev_rx_handler(struct ntb_transport_qp *qp, void *qp_data,
skb->protocol = eth_type_trans(skb, ndev);
skb->ip_summed = CHECKSUM_NONE;
- if (netif_rx(skb) == NET_RX_DROP) {
+ if (__netif_rx(skb) == NET_RX_DROP) {
ndev->stats.rx_errors++;
ndev->stats.rx_dropped++;
} else {
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index 1a95f3beb784d..39e61e07e4894 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -109,7 +109,7 @@ static int rionet_rx_clean(struct net_device *ndev)
skb_put(rnet->rx_skb[i], RIO_MAX_MSG_SIZE);
rnet->rx_skb[i]->protocol =
eth_type_trans(rnet->rx_skb[i], ndev);
- error = netif_rx(rnet->rx_skb[i]);
+ error = __netif_rx(rnet->rx_skb[i]);
if (error == NET_RX_DROP) {
ndev->stats.rx_dropped++;
diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c
index 57a6d598467b2..c3f8020571add 100644
--- a/drivers/net/sb1000.c
+++ b/drivers/net/sb1000.c
@@ -872,7 +872,7 @@ printk("cm0: IP identification: %02x%02x fragment offset: %02x%02x\n", buffer[3
/* datagram completed: send to upper level */
skb_trim(skb, dlen);
- netif_rx(skb);
+ __netif_rx(skb);
stats->rx_bytes+=dlen;
stats->rx_packets++;
lp->rx_skb[ns] = NULL;
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index d29fb9759cc95..58b20ea171dda 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -287,7 +287,7 @@ static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
{
return __dev_forward_skb(dev, skb) ?: xdp ?
veth_xdp_rx(rq, skb) :
- netif_rx(skb);
+ __netif_rx(skb);
}
/* return true if the specified skb has chances of GRO aggregation
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index e0b1ab99a359e..714cafcf6c6c8 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -418,7 +418,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
skb->protocol = eth_type_trans(skb, dev);
- if (likely(netif_rx(skb) == NET_RX_SUCCESS))
+ if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
vrf_rx_stats(dev, len);
else
this_cpu_inc(dev->dstats->rx_drps);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 359d16780dbbc..d0dc90d3dac28 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2541,7 +2541,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
tx_stats->tx_bytes += len;
u64_stats_update_end(&tx_stats->syncp);
- if (netif_rx(skb) == NET_RX_SUCCESS) {
+ if (__netif_rx(skb) == NET_RX_SUCCESS) {
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->rx_packets++;
rx_stats->rx_bytes += len;
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 183b90923f51b..a0c883f19a417 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -4160,9 +4160,11 @@ static irqreturn_t dwc3_thread_interrupt(int irq, void *_evt)
unsigned long flags;
irqreturn_t ret = IRQ_NONE;
+ local_bh_disable();
spin_lock_irqsave(&dwc->lock, flags);
ret = dwc3_process_event_buf(evt);
spin_unlock_irqrestore(&dwc->lock, flags);
+ local_bh_enable();
return ret;
}
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 411a428ace4d4..481e565cc5c42 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -100,6 +100,7 @@ enum cpuhp_state {
CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
CPUHP_PADATA_DEAD,
CPUHP_AP_DTPM_CPU_DEAD,
+ CPUHP_RANDOM_PREPARE,
CPUHP_WORKQUEUE_PREP,
CPUHP_POWER_NUMA_PREPARE,
CPUHP_HRTIMERS_PREPARE,
@@ -240,6 +241,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_CSKY_ONLINE,
CPUHP_AP_WATCHDOG_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
+ CPUHP_AP_RANDOM_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_BASE_CACHEINFO_ONLINE,
CPUHP_AP_ONLINE_DYN,
diff --git a/include/linux/random.h b/include/linux/random.h
index c45b2693e51fb..b78ac91e6b256 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -158,4 +158,9 @@ static inline bool __init arch_get_random_long_early(unsigned long *v)
}
#endif
+#ifdef CONFIG_SMP
+extern int random_prepare_cpu(unsigned int cpu);
+extern int random_online_cpu(unsigned int cpu);
+#endif
+
#endif /* _LINUX_RANDOM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 613178f062308..99807ada20a6e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1084,7 +1084,6 @@ struct task_struct {
sigset_t saved_sigmask;
struct sigpending pending;
#ifdef CONFIG_PREEMPT_RT
- /* TODO: move me into ->restart_block ? */
struct kernel_siginfo forced_info;
#endif
unsigned long sas_ss_sp;
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index ccd1336aa7f42..892562ebbd3aa 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -70,7 +70,6 @@ static inline void *try_get_task_stack(struct task_struct *tsk)
}
extern void put_task_stack(struct task_struct *tsk);
-extern void put_task_stack_sched(struct task_struct *tsk);
#else
static inline void *try_get_task_stack(struct task_struct *tsk)
{
@@ -78,13 +77,6 @@ static inline void *try_get_task_stack(struct task_struct *tsk)
}
static inline void put_task_stack(struct task_struct *tsk) {}
-static inline void put_task_stack_sched(struct task_struct *tsk) {}
-#endif
-
-#ifdef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
-static inline void task_stack_cleanup(struct task_struct *tsk) {}
-#else
-extern void task_stack_cleanup(struct task_struct *tsk);
#endif
void exit_task_stack_account(struct task_struct *tsk);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 407a2568f35eb..01dabccad814e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,6 +34,7 @@
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
+#include <linux/random.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -1659,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
@@ -1782,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
diff --git a/kernel/exit.c b/kernel/exit.c
index 293b280d23192..c303cffe7fdb4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -171,7 +171,6 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
kprobe_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
- task_stack_cleanup(tsk);
put_task_struct(tsk);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 416abb6fc218c..1279b57c4ad9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -179,16 +179,6 @@ static inline void free_task_struct(struct task_struct *tsk)
#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
-#define THREAD_STACK_DELAYED_FREE 1UL
-
-static void thread_stack_mark_delayed_free(struct task_struct *tsk)
-{
- unsigned long val = (unsigned long)tsk->stack;
-
- val |= THREAD_STACK_DELAYED_FREE;
- WRITE_ONCE(tsk->stack, (void *)val);
-}
-
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
@@ -203,6 +193,41 @@ static void thread_stack_mark_delayed_free(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -304,31 +329,29 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
return 0;
}
-static void free_thread_stack(struct task_struct *tsk, bool cache_only)
+static void free_thread_stack(struct task_struct *tsk)
{
- int i;
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i], NULL,
- tsk->stack_vm_area) != NULL)
- continue;
-
- tsk->stack = NULL;
- tsk->stack_vm_area = NULL;
- return;
- }
- if (cache_only) {
- thread_stack_mark_delayed_free(tsk);
- return;
- }
-
- vfree(tsk->stack);
tsk->stack = NULL;
tsk->stack_vm_area = NULL;
}
# else /* !CONFIG_VMAP_STACK */
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
@@ -341,13 +364,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
return -ENOMEM;
}
-static void free_thread_stack(struct task_struct *tsk, bool cache_only)
+static void free_thread_stack(struct task_struct *tsk)
{
- if (cache_only) {
- thread_stack_mark_delayed_free(tsk);
- return;
- }
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}
@@ -356,6 +375,18 @@ static void free_thread_stack(struct task_struct *tsk, bool cache_only)
static struct kmem_cache *thread_stack_cache;
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
@@ -365,13 +396,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
return stack ? 0 : -ENOMEM;
}
-static void free_thread_stack(struct task_struct *tsk, bool cache_only)
+static void free_thread_stack(struct task_struct *tsk)
{
- if (cache_only) {
- thread_stack_mark_delayed_free(tsk);
- return;
- }
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
tsk->stack = NULL;
}
@@ -384,19 +411,8 @@ void thread_stack_cache_init(void)
}
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-
-void task_stack_cleanup(struct task_struct *tsk)
-{
- unsigned long val = (unsigned long)tsk->stack;
-
- if (!(val & THREAD_STACK_DELAYED_FREE))
- return;
-
- WRITE_ONCE(tsk->stack, (void *)(val & ~THREAD_STACK_DELAYED_FREE));
- free_thread_stack(tsk, false);
-}
-
#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
@@ -406,9 +422,10 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
return stack ? 0 : -ENOMEM;
}
-static void free_thread_stack(struct task_struct *tsk, bool cache_only)
+static void free_thread_stack(struct task_struct *tsk)
{
arch_free_thread_stack(tsk);
+ tsk->stack = NULL;
}
#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
@@ -498,25 +515,19 @@ void exit_task_stack_account(struct task_struct *tsk)
}
}
-static void release_task_stack(struct task_struct *tsk, bool cache_only)
+static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
- free_thread_stack(tsk, cache_only);
+ free_thread_stack(tsk);
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
if (refcount_dec_and_test(&tsk->stack_refcount))
- release_task_stack(tsk, false);
-}
-
-void put_task_stack_sched(struct task_struct *tsk)
-{
- if (refcount_dec_and_test(&tsk->stack_refcount))
- release_task_stack(tsk, true);
+ release_task_stack(tsk);
}
#endif
@@ -530,7 +541,7 @@ void free_task(struct task_struct *tsk)
* The task is finally done with both the stack and thread_info,
* so free both.
*/
- release_task_stack(tsk, false);
+ release_task_stack(tsk);
#else
/*
* If the task had a separate stack allocation, it should be gone
@@ -1030,7 +1041,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
free_stack:
exit_task_stack_account(tsk);
- free_thread_stack(tsk, false);
+ free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 97223df2f460e..346d283d2da14 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -663,13 +663,15 @@ int generic_handle_irq(unsigned int irq)
EXPORT_SYMBOL_GPL(generic_handle_irq);
/**
- * generic_handle_irq_safe - Invoke the handler for a particular irq
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
* @irq: The irq number to handle
*
- * Returns: 0 on success, or -EINVAL if conversion has failed
+ * Returns: 0 on success, a negative value on error.
*
- * This function must be called either from an IRQ context with irq regs
- * initialized or with care from any context.
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
*/
int generic_handle_irq_safe(unsigned int irq)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d9b85784a1188..327a48b14f00a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4948,11 +4948,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * Cache only the VMAP stack. The final deallocation is in
- * delayed_put_task_struct.
- */
- put_task_stack_sched(prev);
+ /* Task is done with its stack. */
+ put_task_stack(prev);
put_task_struct_rcu_user(prev);
}
diff --git a/localversion-rt b/localversion-rt
index 700c857efd9ba..22746d6390a42 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt8
+-rt9
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a86e984c6b66a..63287fd03250b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -169,6 +169,7 @@ struct mem_cgroup_event {
struct work_struct remove;
};
+static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/* Stuffs for move charges at task migration. */
@@ -260,10 +261,8 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
-struct memcg_stock_pcp;
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages,
- bool stock_lock_acquried);
+ unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
{
@@ -297,7 +296,7 @@ static void obj_cgroup_release(struct percpu_ref *ref)
nr_pages = nr_bytes >> PAGE_SHIFT;
if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages, false);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
@@ -522,6 +521,43 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
return excess;
}
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
+{
+ unsigned long excess;
+ struct mem_cgroup_per_node *mz;
+ struct mem_cgroup_tree_per_node *mctz;
+
+ mctz = soft_limit_tree.rb_tree_per_node[nid];
+ if (!mctz)
+ return;
+ /*
+ * Necessary to update all ancestors when hierarchy is used.
+ * because their event counter is not touched.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ mz = memcg->nodeinfo[nid];
+ excess = soft_limit_excess(memcg);
+ /*
+ * We have to update the tree if mz is on RB-tree or
+ * mem is over its softlimit.
+ */
+ if (excess || mz->on_tree) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&mctz->lock, flags);
+ /* if on-tree, remove it */
+ if (mz->on_tree)
+ __mem_cgroup_remove_exceeded(mz, mctz);
+ /*
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
+ */
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ spin_unlock_irqrestore(&mctz->lock, flags);
+ }
+ }
+}
+
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
struct mem_cgroup_tree_per_node *mctz;
@@ -593,6 +629,35 @@ static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT because the
+ * callers cannot rely on an acquired spinlock_t lock to provide it. These
+ * functions are never used in hardirq context on PREEMPT_RT and therefore
+ * disabling preemption is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_disable();
+#else
+ VM_BUG_ON(!irqs_disabled());
+#endif
+}
+
+static void __memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_disable();
+#endif
+}
+
+static void memcg_stats_unlock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_enable();
+#endif
+}
+
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
unsigned int x;
@@ -669,8 +734,27 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ /*
+ * The callers from rmap rely on disabled preemption because they never
+ * update their counter from in-interrupt context. For these counters we
+ * check that the update is never performed from an interrupt context, while
+ * the other callers need to have interrupts disabled.
+ */
+ __memcg_stats_lock();
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ switch (idx) {
+ case NR_ANON_MAPPED:
+ case NR_FILE_MAPPED:
+ case NR_ANON_THPS:
+ case NR_SHMEM_PMDMAPPED:
+ case NR_FILE_PMDMAPPED:
+ WARN_ON_ONCE(!in_task());
+ break;
+ default:
+ WARN_ON_ONCE(!irqs_disabled());
+ }
+ }
+
/* Update memcg */
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
@@ -678,8 +762,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
memcg_rstat_updated(memcg, val);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ memcg_stats_unlock();
}
/**
@@ -762,12 +845,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
- if (IS_ENABLED(PREEMPT_RT))
- preempt_disable();
+ memcg_stats_lock();
__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
memcg_rstat_updated(memcg, count);
- if (IS_ENABLED(PREEMPT_RT))
- preempt_enable();
+ memcg_stats_unlock();
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -799,6 +880,53 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
+static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target)
+{
+ unsigned long val, next;
+
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
+ /* from time_after() in jiffies.h */
+ if ((long)(next - val) < 0) {
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ default:
+ break;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Check events in order.
+ *
+ */
+static void memcg_check_events(struct mem_cgroup *memcg, int nid)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return;
+
+ /* threshold event is triggered in finer grain than soft limit */
+ if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_THRESH))) {
+ bool do_softlimit;
+
+ do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+ mem_cgroup_threshold(memcg);
+ if (unlikely(do_softlimit))
+ mem_cgroup_update_tree(memcg, nid);
+ }
+}
+
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
@@ -2013,29 +2141,18 @@ void unlock_page_memcg(struct page *page)
folio_memcg_unlock(page_folio(page));
}
-struct obj_stock {
+struct memcg_stock_pcp {
+ local_lock_t stock_lock;
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ unsigned int nr_pages;
+
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
-#else
- int dummy[0];
#endif
-};
-
-struct memcg_stock_pcp {
- /* Protects memcg_stock_pcp */
- local_lock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
-#ifndef CONFIG_PREEMPTION
- /* Protects only task_obj */
- local_lock_t task_obj_lock;
- struct obj_stock task_obj;
-#endif
- struct obj_stock irq_obj;
struct work_struct work;
unsigned long flags;
@@ -2043,21 +2160,16 @@ struct memcg_stock_pcp {
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
.stock_lock = INIT_LOCAL_LOCK(stock_lock),
-#ifndef CONFIG_PREEMPTION
- .task_obj_lock = INIT_LOCAL_LOCK(task_obj_lock),
-#endif
};
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock,
- bool stock_lock_acquried);
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
#else
-static inline struct obj_cgroup *drain_obj_stock(struct obj_stock *stock,
- bool stock_lock_acquried)
+static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
return NULL;
}
@@ -2124,30 +2236,23 @@ static void drain_stock(struct memcg_stock_pcp *stock)
static void drain_local_stock(struct work_struct *dummy)
{
- struct memcg_stock_pcp *stock_pcp;
- struct obj_cgroup *old;
+ struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
+ unsigned long flags;
/*
* The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
-#ifndef CONFIG_PREEMPTION
- local_lock(&memcg_stock.task_obj_lock);
- old = drain_obj_stock(&this_cpu_ptr(&memcg_stock)->task_obj, NULL);
- local_unlock(&memcg_stock.task_obj_lock);
- if (old)
- obj_cgroup_put(old);
-#endif
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
- local_lock_irq(&memcg_stock.stock_lock);
- stock_pcp = this_cpu_ptr(&memcg_stock);
- old = drain_obj_stock(&stock_pcp->irq_obj, stock_pcp);
+ stock = this_cpu_ptr(&memcg_stock);
+ old = drain_obj_stock(stock);
+ drain_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- drain_stock(stock_pcp);
- clear_bit(FLUSHING_CACHED_CHARGE, &stock_pcp->flags);
-
- local_unlock_irq(&memcg_stock.stock_lock);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
if (old)
obj_cgroup_put(old);
}
@@ -2158,9 +2263,9 @@ static void drain_local_stock(struct work_struct *dummy)
*/
static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
- struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
+ struct memcg_stock_pcp *stock;
- lockdep_assert_held(&stock->stock_lock);
+ stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
css_get(&memcg->css);
@@ -2172,15 +2277,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
drain_stock(stock);
}
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
- bool stock_lock_acquried)
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
- if (stock_lock_acquried) {
- __refill_stock(memcg, nr_pages);
- return;
- }
local_lock_irqsave(&memcg_stock.stock_lock, flags);
__refill_stock(memcg, nr_pages);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
@@ -2192,7 +2292,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
*/
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
- int cpu;
+ int cpu, curcpu;
/* If someone's already draining, avoid adding running more workers. */
if (!mutex_trylock(&percpu_charge_mutex))
@@ -2203,7 +2303,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- cpus_read_lock();
+ migrate_disable();
+ curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2219,10 +2320,14 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
rcu_read_unlock();
if (flush &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
- schedule_work_on(cpu, &stock->work);
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (cpu == curcpu)
+ drain_local_stock(&stock->work);
+ else
+ schedule_work_on(cpu, &stock->work);
+ }
}
- cpus_read_unlock();
+ migrate_enable();
mutex_unlock(&percpu_charge_mutex);
}
@@ -2623,7 +2728,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
done_restock:
if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages, false);
+ refill_stock(memcg, batch - nr_pages);
/*
* If the hierarchy is above the normal consumption range, schedule
@@ -2725,49 +2830,6 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
*/
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
-/*
- * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
- * sequence used in this case to access content from object stock is slow.
- * To optimize for user context access, there are now two object stocks for
- * task context and interrupt context access respectively.
- *
- * The task context object stock can be accessed by disabling preemption only
- * which is cheap in non-preempt kernel. The interrupt context object stock
- * can only be accessed after disabling interrupt. User context code can
- * access interrupt object stock, but not vice versa.
- */
-static inline struct obj_stock *get_obj_stock(unsigned long *pflags,
- bool *stock_lock_acquried)
-{
- struct memcg_stock_pcp *stock;
-
-#ifndef CONFIG_PREEMPTION
- if (likely(in_task())) {
- *pflags = 0UL;
- *stock_lock_acquried = false;
- local_lock(&memcg_stock.task_obj_lock);
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->task_obj;
- }
-#endif
- *stock_lock_acquried = true;
- local_lock_irqsave(&memcg_stock.stock_lock, *pflags);
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->irq_obj;
-}
-
-static inline void put_obj_stock(unsigned long flags,
- bool stock_lock_acquried)
-{
-#ifndef CONFIG_PREEMPTION
- if (likely(!stock_lock_acquried)) {
- local_unlock(&memcg_stock.task_obj_lock);
- return;
- }
-#endif
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
-}
-
/*
* mod_objcg_mlstate() may be called with irq enabled, so
* mod_memcg_lruvec_state() should be used.
@@ -2948,8 +3010,7 @@ static void memcg_free_cache_id(int id)
* @nr_pages: number of pages to uncharge
*/
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
- unsigned int nr_pages,
- bool stock_lock_acquried)
+ unsigned int nr_pages)
{
struct mem_cgroup *memcg;
@@ -2957,7 +3018,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
- refill_stock(memcg, nr_pages, stock_lock_acquried);
+ refill_stock(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -3031,7 +3092,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
return;
objcg = __folio_objcg(folio);
- obj_cgroup_uncharge_pages(objcg, nr_pages, false);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
folio->memcg_data = 0;
obj_cgroup_put(objcg);
}
@@ -3039,21 +3100,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
- bool stock_lock_acquried;
- unsigned long flags;
+ struct memcg_stock_pcp *stock;
struct obj_cgroup *old = NULL;
- struct obj_stock *stock;
+ unsigned long flags;
int *bytes;
- stock = get_obj_stock(&flags, &stock_lock_acquried);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ stock = this_cpu_ptr(&memcg_stock);
+
/*
* Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat or idx
* changes.
*/
if (stock->cached_objcg != objcg) {
- old = drain_obj_stock(stock, stock_lock_acquried);
-
+ old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
@@ -3097,31 +3158,31 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
- put_obj_stock(flags, stock_lock_acquried);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
if (old)
obj_cgroup_put(old);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
- bool stock_lock_acquried;
+ struct memcg_stock_pcp *stock;
unsigned long flags;
- struct obj_stock *stock;
bool ret = false;
- stock = get_obj_stock(&flags, &stock_lock_acquried);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
- put_obj_stock(flags, stock_lock_acquried);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
-static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock,
- bool stock_lock_acquried)
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
@@ -3132,8 +3193,18 @@ static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock,
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
- if (nr_pages)
- obj_cgroup_uncharge_pages(old, nr_pages, stock_lock_acquried);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(old);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ __refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
+ }
/*
* The leftover is flushed to the centralized per-memcg value.
@@ -3169,6 +3240,10 @@ static struct obj_cgroup *drain_obj_stock(struct obj_stock *stock,
}
stock->cached_objcg = NULL;
+ /*
+ * The `old' object needs to be released by the caller via
+ * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
+ */
return old;
}
@@ -3177,15 +3252,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{
struct mem_cgroup *memcg;
-#ifndef CONFIG_PREEMPTION
- if (in_task() && stock->task_obj.cached_objcg) {
- memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
- if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
- }
-#endif
- if (stock->irq_obj.cached_objcg) {
- memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3196,15 +3264,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool allow_uncharge)
{
- bool stock_lock_acquried;
- unsigned long flags;
- struct obj_stock *stock;
- unsigned int nr_pages = 0;
+ struct memcg_stock_pcp *stock;
struct obj_cgroup *old = NULL;
+ unsigned long flags;
+ unsigned int nr_pages = 0;
- stock = get_obj_stock(&flags, &stock_lock_acquried);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
- old = drain_obj_stock(stock, stock_lock_acquried);
+ old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->cached_objcg = objcg;
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
@@ -3218,12 +3287,12 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- put_obj_stock(flags, stock_lock_acquried);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
if (old)
obj_cgroup_put(old);
if (nr_pages)
- obj_cgroup_uncharge_pages(objcg, nr_pages, false);
+ obj_cgroup_uncharge_pages(objcg, nr_pages);
}
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
@@ -3743,12 +3812,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
break;
case RES_SOFT_LIMIT:
-#ifndef CONFIG_PREEMPT_RT
- memcg->soft_limit = nr_pages;
- ret = 0;
-#else
- ret = -EOPNOTSUPP;
-#endif
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ ret = -EOPNOTSUPP;
+ } else {
+ memcg->soft_limit = nr_pages;
+ ret = 0;
+ }
break;
}
return ret ?: nbytes;
@@ -4053,6 +4122,82 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
return 0;
}
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+ struct mem_cgroup_threshold_ary *t;
+ unsigned long usage;
+ int i;
+
+ rcu_read_lock();
+ if (!swap)
+ t = rcu_dereference(memcg->thresholds.primary);
+ else
+ t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+ if (!t)
+ goto unlock;
+
+ usage = mem_cgroup_usage(memcg, swap);
+
+ /*
+ * current_threshold points to threshold just below or equal to usage.
+ * If it's not true, a threshold was crossed after last
+ * call of __mem_cgroup_threshold().
+ */
+ i = t->current_threshold;
+
+ /*
+ * Iterate backward over array of thresholds starting from
+ * current_threshold and check if a threshold is crossed.
+ * If none of thresholds below usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* i = current_threshold + 1 */
+ i++;
+
+ /*
+ * Iterate forward over array of thresholds starting from
+ * current_threshold+1 and check if a threshold is crossed.
+ * If none of thresholds above usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* Update current_threshold */
+ t->current_threshold = i - 1;
+unlock:
+ rcu_read_unlock();
+}
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+ while (memcg) {
+ __mem_cgroup_threshold(memcg, false);
+ if (do_memsw_account())
+ __mem_cgroup_threshold(memcg, true);
+
+ memcg = parent_mem_cgroup(memcg);
+ }
+}
+
+static int compare_thresholds(const void *a, const void *b)
+{
+ const struct mem_cgroup_threshold *_a = a;
+ const struct mem_cgroup_threshold *_b = b;
+
+ if (_a->threshold > _b->threshold)
+ return 1;
+
+ if (_a->threshold < _b->threshold)
+ return -1;
+
+ return 0;
+}
+
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
struct mem_cgroup_eventfd_list *ev;
@@ -4074,6 +4219,234 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+{
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ unsigned long threshold;
+ unsigned long usage;
+ int i, size, ret;
+
+ ret = page_counter_memparse(args, "-1", &threshold);
+ if (ret)
+ return ret;
+
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM) {
+ thresholds = &memcg->thresholds;
+ usage = mem_cgroup_usage(memcg, false);
+ } else if (type == _MEMSWAP) {
+ thresholds = &memcg->memsw_thresholds;
+ usage = mem_cgroup_usage(memcg, true);
+ } else
+ BUG();
+
+ /* Check if a threshold crossed before adding a new one */
+ if (thresholds->primary)
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+ /* Allocate memory for new array of thresholds */
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ new->size = size;
+
+ /* Copy thresholds (if any) to new array */
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
+
+ /* Add new threshold */
+ new->entries[size - 1].eventfd = eventfd;
+ new->entries[size - 1].threshold = threshold;
+
+ /* Sort thresholds. Registering of new threshold isn't time-critical */
+ sort(new->entries, size, sizeof(*new->entries),
+ compare_thresholds, NULL);
+
+ /* Find current threshold */
+ new->current_threshold = -1;
+ for (i = 0; i < size; i++) {
+ if (new->entries[i].threshold <= usage) {
+ /*
+ * new->current_threshold will not be used until
+ * rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ } else
+ break;
+ }
+
+ /* Free old spare buffer and save old primary buffer as spare */
+ kfree(thresholds->spare);
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+
+ return ret;
+}
+
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
+{
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ unsigned long usage;
+ int i, j, size, entries;
+
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM) {
+ thresholds = &memcg->thresholds;
+ usage = mem_cgroup_usage(memcg, false);
+ } else if (type == _MEMSWAP) {
+ thresholds = &memcg->memsw_thresholds;
+ usage = mem_cgroup_usage(memcg, true);
+ } else
+ BUG();
+
+ if (!thresholds->primary)
+ goto unlock;
+
+ /* Check if a threshold crossed before removing */
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ /* Calculate new number of threshold */
+ size = entries = 0;
+ for (i = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd != eventfd)
+ size++;
+ else
+ entries++;
+ }
+
+ new = thresholds->spare;
+
+ /* If no items related to eventfd have been cleared, nothing to do */
+ if (!entries)
+ goto unlock;
+
+ /* Set thresholds array to NULL if we don't have thresholds */
+ if (!size) {
+ kfree(new);
+ new = NULL;
+ goto swap_buffers;
+ }
+
+ new->size = size;
+
+ /* Copy thresholds and find current threshold */
+ new->current_threshold = -1;
+ for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+ if (thresholds->primary->entries[i].eventfd == eventfd)
+ continue;
+
+ new->entries[j] = thresholds->primary->entries[i];
+ if (new->entries[j].threshold <= usage) {
+ /*
+ * new->current_threshold will not be used
+ * until rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ }
+ j++;
+ }
+
+swap_buffers:
+ /* Swap primary and spare array */
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
+unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+}
+
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ struct mem_cgroup_eventfd_list *event;
+
+ event = kmalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ spin_lock(&memcg_oom_lock);
+
+ event->eventfd = eventfd;
+ list_add(&event->list, &memcg->oom_notify);
+
+ /* already in OOM ? */
+ if (memcg->under_oom)
+ eventfd_signal(eventfd, 1);
+ spin_unlock(&memcg_oom_lock);
+
+ return 0;
+}
+
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ struct mem_cgroup_eventfd_list *ev, *tmp;
+
+ spin_lock(&memcg_oom_lock);
+
+ list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
+ if (ev->eventfd == eventfd) {
+ list_del(&ev->list);
+ kfree(ev);
+ }
+ }
+
+ spin_unlock(&memcg_oom_lock);
+}
+
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
@@ -4314,7 +4687,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
#endif /* CONFIG_CGROUP_WRITEBACK */
-#ifndef CONFIG_PREEMPT_RT
/*
* DO NOT USE IN NEW FILES.
*
@@ -4328,391 +4700,6 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
* possible.
*/
-static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
- enum mem_cgroup_events_target target)
-{
- unsigned long val, next;
-
- val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
- next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
- /* from time_after() in jiffies.h */
- if ((long)(next - val) < 0) {
- switch (target) {
- case MEM_CGROUP_TARGET_THRESH:
- next = val + THRESHOLDS_EVENTS_TARGET;
- break;
- case MEM_CGROUP_TARGET_SOFTLIMIT:
- next = val + SOFTLIMIT_EVENTS_TARGET;
- break;
- default:
- break;
- }
- __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
- return true;
- }
- return false;
-}
-
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
-{
- unsigned long excess;
- struct mem_cgroup_per_node *mz;
- struct mem_cgroup_tree_per_node *mctz;
-
- mctz = soft_limit_tree.rb_tree_per_node[nid];
- if (!mctz)
- return;
- /*
- * Necessary to update all ancestors when hierarchy is used.
- * because their event counter is not touched.
- */
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = memcg->nodeinfo[nid];
- excess = soft_limit_excess(memcg);
- /*
- * We have to update the tree if mz is on RB-tree or
- * mem is over its softlimit.
- */
- if (excess || mz->on_tree) {
- unsigned long flags;
-
- spin_lock_irqsave(&mctz->lock, flags);
- /* if on-tree, remove it */
- if (mz->on_tree)
- __mem_cgroup_remove_exceeded(mz, mctz);
- /*
- * Insert again. mz->usage_in_excess will be updated.
- * If excess is 0, no tree ops.
- */
- __mem_cgroup_insert_exceeded(mz, mctz, excess);
- spin_unlock_irqrestore(&mctz->lock, flags);
- }
- }
-}
-
-static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
-{
- struct mem_cgroup_threshold_ary *t;
- unsigned long usage;
- int i;
-
- rcu_read_lock();
- if (!swap)
- t = rcu_dereference(memcg->thresholds.primary);
- else
- t = rcu_dereference(memcg->memsw_thresholds.primary);
-
- if (!t)
- goto unlock;
-
- usage = mem_cgroup_usage(memcg, swap);
-
- /*
- * current_threshold points to threshold just below or equal to usage.
- * If it's not true, a threshold was crossed after last
- * call of __mem_cgroup_threshold().
- */
- i = t->current_threshold;
-
- /*
- * Iterate backward over array of thresholds starting from
- * current_threshold and check if a threshold is crossed.
- * If none of thresholds below usage is crossed, we read
- * only one element of the array here.
- */
- for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
- eventfd_signal(t->entries[i].eventfd, 1);
-
- /* i = current_threshold + 1 */
- i++;
-
- /*
- * Iterate forward over array of thresholds starting from
- * current_threshold+1 and check if a threshold is crossed.
- * If none of thresholds above usage is crossed, we read
- * only one element of the array here.
- */
- for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
- eventfd_signal(t->entries[i].eventfd, 1);
-
- /* Update current_threshold */
- t->current_threshold = i - 1;
-unlock:
- rcu_read_unlock();
-}
-
-static void mem_cgroup_threshold(struct mem_cgroup *memcg)
-{
- while (memcg) {
- __mem_cgroup_threshold(memcg, false);
- if (do_memsw_account())
- __mem_cgroup_threshold(memcg, true);
-
- memcg = parent_mem_cgroup(memcg);
- }
-}
-
-/*
- * Check events in order.
- *
- */
-static void memcg_check_events(struct mem_cgroup *memcg, int nid)
-{
- /* threshold event is triggered in finer grain than soft limit */
- if (unlikely(mem_cgroup_event_ratelimit(memcg,
- MEM_CGROUP_TARGET_THRESH))) {
- bool do_softlimit;
-
- do_softlimit = mem_cgroup_event_ratelimit(memcg,
- MEM_CGROUP_TARGET_SOFTLIMIT);
- mem_cgroup_threshold(memcg);
- if (unlikely(do_softlimit))
- mem_cgroup_update_tree(memcg, nid);
- }
-}
-
-static int compare_thresholds(const void *a, const void *b)
-{
- const struct mem_cgroup_threshold *_a = a;
- const struct mem_cgroup_threshold *_b = b;
-
- if (_a->threshold > _b->threshold)
- return 1;
-
- if (_a->threshold < _b->threshold)
- return -1;
-
- return 0;
-}
-
-static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args, enum res_type type)
-{
- struct mem_cgroup_thresholds *thresholds;
- struct mem_cgroup_threshold_ary *new;
- unsigned long threshold;
- unsigned long usage;
- int i, size, ret;
-
- ret = page_counter_memparse(args, "-1", &threshold);
- if (ret)
- return ret;
-
- mutex_lock(&memcg->thresholds_lock);
-
- if (type == _MEM) {
- thresholds = &memcg->thresholds;
- usage = mem_cgroup_usage(memcg, false);
- } else if (type == _MEMSWAP) {
- thresholds = &memcg->memsw_thresholds;
- usage = mem_cgroup_usage(memcg, true);
- } else
- BUG();
-
- /* Check if a threshold crossed before adding a new one */
- if (thresholds->primary)
- __mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
- size = thresholds->primary ? thresholds->primary->size + 1 : 1;
-
- /* Allocate memory for new array of thresholds */
- new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- goto unlock;
- }
- new->size = size;
-
- /* Copy thresholds (if any) to new array */
- if (thresholds->primary)
- memcpy(new->entries, thresholds->primary->entries,
- flex_array_size(new, entries, size - 1));
-
- /* Add new threshold */
- new->entries[size - 1].eventfd = eventfd;
- new->entries[size - 1].threshold = threshold;
-
- /* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(new->entries, size, sizeof(*new->entries),
- compare_thresholds, NULL);
-
- /* Find current threshold */
- new->current_threshold = -1;
- for (i = 0; i < size; i++) {
- if (new->entries[i].threshold <= usage) {
- /*
- * new->current_threshold will not be used until
- * rcu_assign_pointer(), so it's safe to increment
- * it here.
- */
- ++new->current_threshold;
- } else
- break;
- }
-
- /* Free old spare buffer and save old primary buffer as spare */
- kfree(thresholds->spare);
- thresholds->spare = thresholds->primary;
-
- rcu_assign_pointer(thresholds->primary, new);
-
- /* To be sure that nobody uses thresholds */
- synchronize_rcu();
-
-unlock:
- mutex_unlock(&memcg->thresholds_lock);
-
- return ret;
-}
-
-static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
-}
-
-static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
-}
-
-static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, enum res_type type)
-{
- struct mem_cgroup_thresholds *thresholds;
- struct mem_cgroup_threshold_ary *new;
- unsigned long usage;
- int i, j, size, entries;
-
- mutex_lock(&memcg->thresholds_lock);
-
- if (type == _MEM) {
- thresholds = &memcg->thresholds;
- usage = mem_cgroup_usage(memcg, false);
- } else if (type == _MEMSWAP) {
- thresholds = &memcg->memsw_thresholds;
- usage = mem_cgroup_usage(memcg, true);
- } else
- BUG();
-
- if (!thresholds->primary)
- goto unlock;
-
- /* Check if a threshold crossed before removing */
- __mem_cgroup_threshold(memcg, type == _MEMSWAP);
-
- /* Calculate new number of threshold */
- size = entries = 0;
- for (i = 0; i < thresholds->primary->size; i++) {
- if (thresholds->primary->entries[i].eventfd != eventfd)
- size++;
- else
- entries++;
- }
-
- new = thresholds->spare;
-
- /* If no items related to eventfd have been cleared, nothing to do */
- if (!entries)
- goto unlock;
-
- /* Set thresholds array to NULL if we don't have thresholds */
- if (!size) {
- kfree(new);
- new = NULL;
- goto swap_buffers;
- }
-
- new->size = size;
-
- /* Copy thresholds and find current threshold */
- new->current_threshold = -1;
- for (i = 0, j = 0; i < thresholds->primary->size; i++) {
- if (thresholds->primary->entries[i].eventfd == eventfd)
- continue;
-
- new->entries[j] = thresholds->primary->entries[i];
- if (new->entries[j].threshold <= usage) {
- /*
- * new->current_threshold will not be used
- * until rcu_assign_pointer(), so it's safe to increment
- * it here.
- */
- ++new->current_threshold;
- }
- j++;
- }
-
-swap_buffers:
- /* Swap primary and spare array */
- thresholds->spare = thresholds->primary;
-
- rcu_assign_pointer(thresholds->primary, new);
-
- /* To be sure that nobody uses thresholds */
- synchronize_rcu();
-
- /* If all events are unregistered, free the spare array */
- if (!new) {
- kfree(thresholds->spare);
- thresholds->spare = NULL;
- }
-unlock:
- mutex_unlock(&memcg->thresholds_lock);
-}
-
-static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
-}
-
-static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
-}
-
-static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- struct mem_cgroup_eventfd_list *event;
-
- event = kmalloc(sizeof(*event), GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- spin_lock(&memcg_oom_lock);
-
- event->eventfd = eventfd;
- list_add(&event->list, &memcg->oom_notify);
-
- /* already in OOM ? */
- if (memcg->under_oom)
- eventfd_signal(eventfd, 1);
- spin_unlock(&memcg_oom_lock);
-
- return 0;
-}
-
-static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- struct mem_cgroup_eventfd_list *ev, *tmp;
-
- spin_lock(&memcg_oom_lock);
-
- list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
- if (ev->eventfd == eventfd) {
- list_del(&ev->list);
- kfree(ev);
- }
- }
-
- spin_unlock(&memcg_oom_lock);
-}
-
/*
* Unregister event and free resources.
*
@@ -4806,6 +4793,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
char *endp;
int ret;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return -EOPNOTSUPP;
+
buf = strstrip(buf);
efd = simple_strtoul(buf, &endp, 10);
@@ -4923,18 +4913,6 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
return ret;
}
-#else
-
-static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- return -EOPNOTSUPP;
-}
-
-static void memcg_check_events(struct mem_cgroup *memcg, int nid) { }
-
-#endif
-
#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
@@ -6899,7 +6877,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
long nr_pages;
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
- bool use_objcg = folio_memcg_kmem(folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
@@ -6908,7 +6885,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
* folio memcg or objcg at this point, we have fully
* exclusive access to the folio.
*/
- if (use_objcg) {
+ if (folio_memcg_kmem(folio)) {
objcg = __folio_objcg(folio);
/*
* This get matches the put at the end of the function and
@@ -6936,7 +6913,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
nr_pages = folio_nr_pages(folio);
- if (use_objcg) {
+ if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
@@ -7116,7 +7093,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
- refill_stock(memcg, nr_pages, false);
+ refill_stock(memcg, nr_pages);
}
static int __init cgroup_memory(char *s)
@@ -7256,18 +7233,10 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* i_pages lock which is taken with interrupts-off. It is
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
- * On PREEMPT_RT interrupts are never disabled and the updates to per-CPU
- * variables are synchronised by keeping preemption disabled.
*/
- if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, -nr_entries);
- } else {
- preempt_disable();
- mem_cgroup_charge_statistics(memcg, -nr_entries);
- preempt_enable();
- }
-
+ memcg_stats_lock();
+ mem_cgroup_charge_statistics(memcg, -nr_entries);
+ memcg_stats_unlock();
memcg_check_events(memcg, page_to_nid(page));
css_put(&memcg->css);
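
For readers following the memcg series: the mem_cgroup_swapout() hunk above
replaces the open-coded PREEMPT_RT branch with the memcg_stats_lock() /
memcg_stats_unlock() helpers added earlier in the queue. As a rough sketch of
what these helpers boil down to -- the names come from the diff, the bodies
below simply mirror the branch that the hunk removes and are not necessarily
the exact helpers in the queue:

/* Sketch only. On PREEMPT_RT the per-CPU counters are serialised by
 * disabling preemption; on !PREEMPT_RT the callers already run with
 * interrupts disabled, so the lock side merely asserts that.
 */
static void memcg_stats_lock(void)
{
#ifdef CONFIG_PREEMPT_RT
	preempt_disable();
#else
	VM_BUG_ON(!irqs_disabled());
#endif
}

static void memcg_stats_unlock(void)
{
#ifdef CONFIG_PREEMPT_RT
	preempt_enable();
#endif
}
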
diff --git a/net/core/dev.c b/net/core/dev.c
index 7ac37cbc42df4..a386d596881d0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4831,26 +4831,40 @@ static int netif_rx_internal(struct sk_buff *skb)
return ret;
}
+/**
+ * __netif_rx - Slightly optimized version of netif_rx
+ * @skb: buffer to post
+ *
+ * This behaves as netif_rx except that it does not disable bottom halves.
+ * As a result this function may only be invoked from interrupt context
+ * (either hard or soft interrupt).
+ */
int __netif_rx(struct sk_buff *skb)
{
int ret;
+ lockdep_assert_once(hardirq_count() | softirq_count());
+
trace_netif_rx_entry(skb);
ret = netif_rx_internal(skb);
trace_netif_rx_exit(ret);
return ret;
}
+EXPORT_SYMBOL(__netif_rx);
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
- * This interface is considered legacy. Modern NIC driver should use NAPI
- * and GRO.
+ * the upper (protocol) levels to process via the backlog NAPI device. It
+ * always succeeds. The buffer may be dropped during processing for
+ * congestion control or by the protocol layers.
+ * The network buffer is passed via the backlog NAPI device. Modern NIC
+ * drivers should use NAPI and GRO.
+ * This function can be used from interrupt and from process context. The
+ * caller from process context must not disable interrupts before invoking
+ * this function.
*
* return values:
* NET_RX_SUCCESS (no congestion)
@@ -4859,11 +4873,16 @@ int __netif_rx(struct sk_buff *skb)
*/
int netif_rx(struct sk_buff *skb)
{
+ bool need_bh_off = !(hardirq_count() | softirq_count());
int ret;
- local_bh_disable();
- ret = __netif_rx(skb);
- local_bh_enable();
+ if (need_bh_off)
+ local_bh_disable();
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ if (need_bh_off)
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL(netif_rx);
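
To illustrate the calling convention that the net/core/dev.c hunks document:
__netif_rx() omits the local_bh_disable()/local_bh_enable() pair and is only
valid in hard or soft interrupt context (enforced by the lockdep assertion),
while netif_rx() now checks the context itself and toggles bottom halves only
when invoked from process context. A minimal, hypothetical driver sketch --
my_irq_handler(), my_inject_from_task() and my_fetch_packet() are placeholder
names, not part of the patch:

#include <linux/etherdevice.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hard interrupt context: bottom halves cannot run here anyway, so the
 * cheaper __netif_rx() is appropriate.
 */
static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb = my_fetch_packet(dev);	/* placeholder helper */

	if (!skb)
		return IRQ_NONE;

	skb->protocol = eth_type_trans(skb, dev);
	__netif_rx(skb);
	return IRQ_HANDLED;
}

/* Process context (e.g. a workqueue): use plain netif_rx(), which disables
 * bottom halves around the backlog enqueue. The caller must not have
 * interrupts disabled here.
 */
static void my_inject_from_task(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
}

Note that in the hunk above netif_rx() open-codes the trace points and the
netif_rx_internal() call rather than wrapping __netif_rx().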