[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <MWHPR21MB1593ED77DF2C9528269228D9D7119@MWHPR21MB1593.namprd21.prod.outlook.com>
Date: Fri, 16 Jul 2021 17:45:15 +0000
From: Michael Kelley <mikelley@...rosoft.com>
To: Haiyang Zhang <haiyangz@...rosoft.com>,
"linux-hyperv@...r.kernel.org" <linux-hyperv@...r.kernel.org>
CC: Haiyang Zhang <haiyangz@...rosoft.com>,
KY Srinivasan <kys@...rosoft.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: RE: [PATCH v2,hyperv-fixes] Drivers: hv: vmbus: Fix duplicate CPU
assignments within a device
From: LKML haiyangz <lkmlhyz@...rosoft.com> On Behalf Of Haiyang Zhang Sent: Thursday, July 15, 2021 6:38 PM
>
> The vmbus module uses a rotational algorithm to assign target CPUs to
> device's channels. Depends on the timing of different device's channel
s/device's/a device's/
s/Depends/Depending/
> offers, different channels of a device may be assigned to the same CPU.
>
> For example on a VM with 2 CPUs, if NIC A and B's channels are offered
> in the following order, NIC A will have both channels on CPU0, and
> NIC B will have both channels on CPU1 -- see below. This kind of
> assignment causes RSS load that is spreading across different channels
> to end up on the same CPU.
>
> Timing of channel offers:
> NIC A channel 0
> NIC B channel 0
> NIC A channel 1
> NIC B channel 1
>
> VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
> Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
> Rel_ID=14, target_cpu=0
> Rel_ID=17, target_cpu=0
>
> VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
> Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
> Rel_ID=16, target_cpu=1
> Rel_ID=18, target_cpu=1
>
> Update the vmbus CPU assignment algorithm to avoid duplicate CPU
> assignments within a device.
>
> The new algorithm iterates num_online_cpus + 1 times.
> The existing rotational algorithm to find "next NUMA & CPU" is still here.
> But if the resulting CPU is already used by the same device, it will try
> the next CPU.
> In the last iteration, it assigns the channel to the next available CPU
> like the existing algorithm. This is not normally expected, because
> during device probe, we limit the number of channels of a device to
> be <= number of online CPUs.
>
> Signed-off-by: Haiyang Zhang <haiyangz@...rosoft.com>
> ---
> drivers/hv/channel_mgmt.c | 96 ++++++++++++++++++++++++++-------------
> 1 file changed, 64 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index caf6d0c4bc1b..8584914291e7 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> */
> mutex_lock(&vmbus_connection.channel_mutex);
>
> + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> + if (guid_equal(&channel->offermsg.offer.if_type,
> + &newchannel->offermsg.offer.if_type) &&
> + guid_equal(&channel->offermsg.offer.if_instance,
> + &newchannel->offermsg.offer.if_instance)) {
> + fnew = false;
> + newchannel->primary_channel = channel;
> + break;
> + }
> + }
> +
> init_vp_index(newchannel);
>
> /* Remember the channels that should be cleaned up upon suspend. */
> @@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> */
> atomic_dec(&vmbus_connection.offer_in_progress);
>
> - list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> - if (guid_equal(&channel->offermsg.offer.if_type,
> - &newchannel->offermsg.offer.if_type) &&
> - guid_equal(&channel->offermsg.offer.if_instance,
> - &newchannel->offermsg.offer.if_instance)) {
> - fnew = false;
> - break;
> - }
> - }
> -
> if (fnew) {
> list_add_tail(&newchannel->listentry,
> &vmbus_connection.chn_list);
> @@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> /*
> * Process the sub-channel.
> */
> - newchannel->primary_channel = channel;
> list_add_tail(&newchannel->sc_list, &channel->sc_list);
> }
>
> @@ -683,6 +683,30 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> queue_work(wq, &newchannel->add_channel_work);
> }
>
> +/*
> + * Check if CPUs used by other channels of the same device.
> + * It's should only be called by init_vp_index().
s/It's/It/
> + */
> +static bool hv_cpuself_used(u32 cpu, struct vmbus_channel *chn)
> +{
> + struct vmbus_channel *primary = chn->primary_channel;
> + struct vmbus_channel *sc;
> +
> + lockdep_assert_held(&vmbus_connection.channel_mutex);
> +
> + if (!primary)
> + return false;
> +
> + if (primary->target_cpu == cpu)
> + return true;
> +
> + list_for_each_entry(sc, &primary->sc_list, sc_list)
> + if (sc != chn && sc->target_cpu == cpu)
> + return true;
> +
> + return false;
> +}
> +
> /*
> * We use this state to statically distribute the channel interrupt load.
> */
> @@ -702,6 +726,7 @@ static int next_numa_node_id;
> static void init_vp_index(struct vmbus_channel *channel)
> {
> bool perf_chn = hv_is_perf_channel(channel);
> + u32 i, ncpu = num_online_cpus();
> cpumask_var_t available_mask;
> struct cpumask *alloced_mask;
> u32 target_cpu;
> @@ -724,31 +749,38 @@ static void init_vp_index(struct vmbus_channel *channel)
> return;
> }
>
> - while (true) {
> - numa_node = next_numa_node_id++;
> - if (numa_node == nr_node_ids) {
> - next_numa_node_id = 0;
> - continue;
> + for (i = 1; i <= ncpu + 1; i++) {
> + while (true) {
> + numa_node = next_numa_node_id++;
> + if (numa_node == nr_node_ids) {
> + next_numa_node_id = 0;
> + continue;
> + }
> + if (cpumask_empty(cpumask_of_node(numa_node)))
> + continue;
> + break;
> + }
> + alloced_mask = &hv_context.hv_numa_map[numa_node];
> +
> + if (cpumask_weight(alloced_mask) ==
> + cpumask_weight(cpumask_of_node(numa_node))) {
> + /*
> + * We have cycled through all the CPUs in the node;
> + * reset the alloced map.
> + */
> + cpumask_clear(alloced_mask);
> }
> - if (cpumask_empty(cpumask_of_node(numa_node)))
> - continue;
> - break;
> - }
> - alloced_mask = &hv_context.hv_numa_map[numa_node];
>
> - if (cpumask_weight(alloced_mask) ==
> - cpumask_weight(cpumask_of_node(numa_node))) {
> - /*
> - * We have cycled through all the CPUs in the node;
> - * reset the alloced map.
> - */
> - cpumask_clear(alloced_mask);
> - }
> + cpumask_xor(available_mask, alloced_mask,
> + cpumask_of_node(numa_node));
>
> - cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
> + target_cpu = cpumask_first(available_mask);
> + cpumask_set_cpu(target_cpu, alloced_mask);
>
> - target_cpu = cpumask_first(available_mask);
> - cpumask_set_cpu(target_cpu, alloced_mask);
> + if (channel->offermsg.offer.sub_channel_index >= ncpu ||
> + i > ncpu || !hv_cpuself_used(target_cpu, channel))
> + break;
> + }
>
> channel->target_cpu = target_cpu;
>
> --
> 2.25.1
I like this approach much better. I did some testing with a variety of
CPU counts (1, 2, 3, 4, 8, and 32), NUMA nodes (1, 3, and 4) and with
4 SCSI controllers and 4 NICs. Everything worked as expected, and it's
definitely an improvement.
I flagged a couple of typos/wording errors, but maybe they can be
fixed when the patch is pulled into hyperv-fixes.
Reviewed-by: Michael Kelley <mikelley@...rosoft.com>
Tested-by: Michael Kelley <mikelley@...rosoft.com>
Powered by blists - more mailing lists