linux-kernel - RE: [PATCH 11/11] scsi: storvsc: Re-init stor_chns when a channel interrupt is re-assigned

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <BN8PR21MB1155C78AAC02D02EA7A7841DCEC20@BN8PR21MB1155.namprd21.prod.outlook.com>
Date:   Mon, 6 Apr 2020 19:54:46 +0000
From:   Long Li <longli@...rosoft.com>
To:     "Andrea Parri (Microsoft)" <parri.andrea@...il.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
CC:     KY Srinivasan <kys@...rosoft.com>,
        Haiyang Zhang <haiyangz@...rosoft.com>,
        Stephen Hemminger <sthemmin@...rosoft.com>,
        Wei Liu <wei.liu@...nel.org>,
        "linux-hyperv@...r.kernel.org" <linux-hyperv@...r.kernel.org>,
        Michael Kelley <mikelley@...rosoft.com>,
        Dexuan Cui <decui@...rosoft.com>,
        Boqun Feng <boqun.feng@...il.com>,
        vkuznets <vkuznets@...hat.com>,
        "James E.J. Bottomley" <jejb@...ux.ibm.com>,
        "Martin K. Petersen" <martin.petersen@...cle.com>,
        "linux-scsi@...r.kernel.org" <linux-scsi@...r.kernel.org>
Subject: RE: [PATCH 11/11] scsi: storvsc: Re-init stor_chns when a channel
 interrupt is re-assigned

>Subject: [PATCH 11/11] scsi: storvsc: Re-init stor_chns when a channel
>interrupt is re-assigned
>
>For each storvsc_device, storvsc keeps track of the channel target CPUs
>associated to the device (alloced_cpus) and it uses this information to fill a
>"cache" (stor_chns) mapping CPU->channel according to a certain heuristic.
>Update the alloced_cpus mask and the stor_chns array when a channel of the
>storvsc device is re-assigned to a different CPU.
>
>Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@...il.com>
>Cc: "James E.J. Bottomley" <jejb@...ux.ibm.com>
>Cc: "Martin K. Petersen" <martin.petersen@...cle.com>
>Cc: <linux-scsi@...r.kernel.org>
>---
> drivers/hv/vmbus_drv.c     |  4 ++
> drivers/scsi/storvsc_drv.c | 95 ++++++++++++++++++++++++++++++++++---
>-
> include/linux/hyperv.h     |  3 ++
> 3 files changed, 94 insertions(+), 8 deletions(-)
>
>diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index
>0f204640c50c2..f0be41bfcbf57 100644
>--- a/drivers/hv/vmbus_drv.c
>+++ b/drivers/hv/vmbus_drv.c
>@@ -1750,6 +1750,10 @@ static ssize_t target_cpu_store(struct
>vmbus_channel *channel,
> 	 * in on a CPU that is different from the channel target_cpu value.
> 	 */
>
>+	if (channel->change_target_cpu_callback)
>+		(*channel->change_target_cpu_callback)(channel,
>+				channel->target_cpu, target_cpu);
>+
> 	channel->target_cpu = target_cpu;
> 	channel->target_vp = hv_cpu_number_to_vp_number(target_cpu);
> 	channel->numa_node = cpu_to_node(target_cpu); diff --git
>a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index
>fb41636519ee8..a680592b9d32a 100644
>--- a/drivers/scsi/storvsc_drv.c
>+++ b/drivers/scsi/storvsc_drv.c
>@@ -621,6 +621,63 @@ static inline struct storvsc_device
>*get_in_stor_device(
>
> }
>
>+void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old,
>+u32 new) {
>+	struct storvsc_device *stor_device;
>+	struct vmbus_channel *cur_chn;
>+	bool old_is_alloced = false;
>+	struct hv_device *device;
>+	unsigned long flags;
>+	int cpu;
>+
>+	device = channel->primary_channel ?
>+			channel->primary_channel->device_obj
>+				: channel->device_obj;
>+	stor_device = get_out_stor_device(device);
>+	if (!stor_device)
>+		return;
>+
>+	/* See storvsc_do_io() -> get_og_chn(). */
>+	spin_lock_irqsave(&device->channel->lock, flags);
>+
>+	/*
>+	 * Determines if the storvsc device has other channels assigned to
>+	 * the "old" CPU to update the alloced_cpus mask and the stor_chns
>+	 * array.
>+	 */
>+	if (device->channel != channel && device->channel->target_cpu ==
>old) {
>+		cur_chn = device->channel;
>+		old_is_alloced = true;
>+		goto old_is_alloced;
>+	}
>+	list_for_each_entry(cur_chn, &device->channel->sc_list, sc_list) {
>+		if (cur_chn == channel)
>+			continue;
>+		if (cur_chn->target_cpu == old) {
>+			old_is_alloced = true;
>+			goto old_is_alloced;
>+		}
>+	}
>+
>+old_is_alloced:
>+	if (old_is_alloced)
>+		WRITE_ONCE(stor_device->stor_chns[old], cur_chn);
>+	else
>+		cpumask_clear_cpu(old, &stor_device->alloced_cpus);

If the old cpu is not allocated, is it still necessary to do a cpumask_clear_cpu?

>+
>+	/* "Flush" the stor_chns array. */
>+	for_each_possible_cpu(cpu) {
>+		if (stor_device->stor_chns[cpu] && !cpumask_test_cpu(
>+					cpu, &stor_device->alloced_cpus))
>+			WRITE_ONCE(stor_device->stor_chns[cpu], NULL);
>+	}
>+
>+	WRITE_ONCE(stor_device->stor_chns[new], channel);
>+	cpumask_set_cpu(new, &stor_device->alloced_cpus);
>+
>+	spin_unlock_irqrestore(&device->channel->lock, flags); }
>+
> static void handle_sc_creation(struct vmbus_channel *new_sc)  {
> 	struct hv_device *device = new_sc->primary_channel->device_obj;
>@@ -648,6 +705,8 @@ static void handle_sc_creation(struct vmbus_channel
>*new_sc)
> 		return;
> 	}
>
>+	new_sc->change_target_cpu_callback = storvsc_change_target_cpu;
>+
> 	/* Add the sub-channel to the array of available channels. */
> 	stor_device->stor_chns[new_sc->target_cpu] = new_sc;
> 	cpumask_set_cpu(new_sc->target_cpu, &stor_device-
>>alloced_cpus); @@ -876,6 +935,8 @@ static int storvsc_channel_init(struct
>hv_device *device, bool is_fc)
> 	if (stor_device->stor_chns == NULL)
> 		return -ENOMEM;
>
>+	device->channel->change_target_cpu_callback =
>+storvsc_change_target_cpu;
>+
> 	stor_device->stor_chns[device->channel->target_cpu] = device-
>>channel;
> 	cpumask_set_cpu(device->channel->target_cpu,
> 			&stor_device->alloced_cpus);
>@@ -1248,8 +1309,10 @@ static struct vmbus_channel *get_og_chn(struct
>storvsc_device *stor_device,
> 	const struct cpumask *node_mask;
> 	int num_channels, tgt_cpu;
>
>-	if (stor_device->num_sc == 0)
>+	if (stor_device->num_sc == 0) {
>+		stor_device->stor_chns[q_num] = stor_device->device-
>>channel;
> 		return stor_device->device->channel;
>+	}
>
> 	/*
> 	 * Our channel array is sparsley populated and we @@ -1258,7 +1321,6
>@@ static struct vmbus_channel *get_og_chn(struct storvsc_device
>*stor_device,
> 	 * The strategy is simple:
> 	 * I. Ensure NUMA locality
> 	 * II. Distribute evenly (best effort)
>-	 * III. Mapping is persistent.
> 	 */
>
> 	node_mask = cpumask_of_node(cpu_to_node(q_num));
>@@ -1268,8 +1330,10 @@ static struct vmbus_channel *get_og_chn(struct
>storvsc_device *stor_device,
> 		if (cpumask_test_cpu(tgt_cpu, node_mask))
> 			num_channels++;
> 	}
>-	if (num_channels == 0)
>+	if (num_channels == 0) {
>+		stor_device->stor_chns[q_num] = stor_device->device-
>>channel;
> 		return stor_device->device->channel;
>+	}
>
> 	hash_qnum = q_num;
> 	while (hash_qnum >= num_channels)
>@@ -1295,6 +1359,7 @@ static int storvsc_do_io(struct hv_device *device,
> 	struct storvsc_device *stor_device;
> 	struct vstor_packet *vstor_packet;
> 	struct vmbus_channel *outgoing_channel, *channel;
>+	unsigned long flags;
> 	int ret = 0;
> 	const struct cpumask *node_mask;
> 	int tgt_cpu;
>@@ -1308,10 +1373,11 @@ static int storvsc_do_io(struct hv_device *device,
>
> 	request->device  = device;
> 	/*
>-	 * Select an an appropriate channel to send the request out.
>+	 * Select an appropriate channel to send the request out.
> 	 */
>-	if (stor_device->stor_chns[q_num] != NULL) {
>-		outgoing_channel = stor_device->stor_chns[q_num];
>+	/* See storvsc_change_target_cpu(). */
>+	outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]);
>+	if (outgoing_channel != NULL) {
> 		if (outgoing_channel->target_cpu == q_num) {
> 			/*
> 			 * Ideally, we want to pick a different channel if @@ -
>1324,7 +1390,10 @@ static int storvsc_do_io(struct hv_device *device,
> 					continue;
> 				if (tgt_cpu == q_num)
> 					continue;
>-				channel = stor_device->stor_chns[tgt_cpu];
>+				channel = READ_ONCE(
>+					stor_device->stor_chns[tgt_cpu]);
>+				if (channel == NULL)
>+					continue;
> 				if (hv_get_avail_to_write_percent(
> 							&channel->outbound)
> 						> ring_avail_percent_lowater)
>{
>@@ -1350,7 +1419,10 @@ static int storvsc_do_io(struct hv_device *device,
> 			for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) {
> 				if (cpumask_test_cpu(tgt_cpu, node_mask))
> 					continue;
>-				channel = stor_device->stor_chns[tgt_cpu];
>+				channel = READ_ONCE(
>+					stor_device->stor_chns[tgt_cpu]);
>+				if (channel == NULL)
>+					continue;
> 				if (hv_get_avail_to_write_percent(
> 							&channel->outbound)
> 						> ring_avail_percent_lowater)
>{
>@@ -1360,7 +1432,14 @@ static int storvsc_do_io(struct hv_device *device,
> 			}
> 		}
> 	} else {
>+		spin_lock_irqsave(&device->channel->lock, flags);
>+		outgoing_channel = stor_device->stor_chns[q_num];
>+		if (outgoing_channel != NULL) {
>+			spin_unlock_irqrestore(&device->channel->lock,
>flags);

Checking outgoing_channel again seems unnecessary. Why not just call get_og_chn()?

>+			goto found_channel;
>+		}
> 		outgoing_channel = get_og_chn(stor_device, q_num);
>+		spin_unlock_irqrestore(&device->channel->lock, flags);
> 	}

With device->channel->lock, now we have one more lock on the I/O issuing path. It doesn't seem optimal as you are trying to protect the code in storvsc_change_target_cpu(), this doesn't need to block concurrent I/O issuers. Maybe moving to RCU is a better approach?

Thanks,
Long


>
> found_channel:
>diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index
>edfcd42319ef3..9018b89614b78 100644
>--- a/include/linux/hyperv.h
>+++ b/include/linux/hyperv.h
>@@ -773,6 +773,9 @@ struct vmbus_channel {
> 	void (*onchannel_callback)(void *context);
> 	void *channel_callback_context;
>
>+	void (*change_target_cpu_callback)(struct vmbus_channel *channel,
>+			u32 old, u32 new);
>+
> 	/*
> 	 * Synchronize channel scheduling and channel removal; see the inline
> 	 * comments in vmbus_chan_sched() and vmbus_reset_channel_cb().
>--
>2.24.0