[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <589d980f-2e4d-47b4-9dc7-8c64dbe271ce@redhat.com>
Date: Mon, 18 Mar 2024 09:41:45 +1000
From: Gavin Shan <gshan@...hat.com>
To: "Michael S. Tsirkin" <mst@...hat.com>
Cc: virtualization@...ts.linux.dev, linux-kernel@...r.kernel.org,
jasowang@...hat.com, xuanzhuo@...ux.alibaba.com, yihyu@...hat.com,
shan.gavin@...il.com, Will Deacon <will@...nel.org>,
Catalin Marinas <catalin.marinas@....com>,
linux-arm-kernel@...ts.infradead.org, mochs@...dia.com
Subject: Re: [PATCH] virtio_ring: Fix the stale index in available ring
On 3/18/24 02:50, Michael S. Tsirkin wrote:
> On Fri, Mar 15, 2024 at 09:24:36PM +1000, Gavin Shan wrote:
>>
>> On 3/15/24 21:05, Michael S. Tsirkin wrote:
>>> On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote:
>>>>>> Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I tried
>>>> to reproduce it with my own driver where one thread writes to the shared buffer
>>>> and another thread reads from the buffer. I don't hit the out-of-order issue so
>>>> far.
>>>
>>> Make sure the 2 areas you are accessing are in different cache lines.
>>>
>>
>> Yes, I already put those 2 areas to separate cache lines.
>>
>>>
>>>> My driver may be not correct somewhere and I will update if I can reproduce
>>>> the issue with my driver in the future.
>>>
>>> Then maybe your change is just making virtio slower and masks the bug
>>> that is actually elsewhere?
>>>
>>> You don't really need a driver. Here's a simple test: without barriers
>>> assertion will fail. With barriers it will not.
>>> (Warning: didn't bother testing too much, could be buggy.
>>>
>>> ---
>>>
>>> #include <pthread.h>
>>> #include <stdio.h>
>>> #include <stdlib.h>
>>> #include <assert.h>
>>>
>>> #define FIRST values[0]
>>> #define SECOND values[64]
>>>
>>> volatile int values[100] = {};
>>>
>>> void* writer_thread(void* arg) {
>>> while (1) {
>>> FIRST++;
>>> // NEED smp_wmb here
>> __asm__ volatile("dmb ishst" : : : "memory");
>>> SECOND++;
>>> }
>>> }
>>>
>>> void* reader_thread(void* arg) {
>>> while (1) {
>>> int first = FIRST;
>>> // NEED smp_rmb here
>> __asm__ volatile("dmb ishld" : : : "memory");
>>> int second = SECOND;
>>> assert(first - second == 1 || first - second == 0);
>>> }
>>> }
>>>
>>> int main() {
>>> pthread_t writer, reader;
>>>
>>> pthread_create(&writer, NULL, writer_thread, NULL);
>>> pthread_create(&reader, NULL, reader_thread, NULL);
>>>
>>> pthread_join(writer, NULL);
>>> pthread_join(reader, NULL);
>>>
>>> return 0;
>>> }
>>>
>>
>> Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit
>> the assert on both of them. After replacing 'dmb' with 'dsb', I can
>> hit assert on both of them too. I need to look at the code closely.
>>
>> [root@...t-mtcollins-02 test]# ./a
>> a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
>> Aborted (core dumped)
>>
>> [root@...dia-grace-hopper-05 test]# ./a
>> a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
>> Aborted (core dumped)
>>
>> Thanks,
>> Gavin
>
>
> Actually this test is broken. No need for ordering it's a simple race.
> The following works on x86 though (x86 does not need barriers
> though).
>
>
> #include <pthread.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <assert.h>
>
> #if 0
> #define x86_rmb() asm volatile("lfence":::"memory")
> #define x86_mb() asm volatile("mfence":::"memory")
> #define x86_smb() asm volatile("sfence":::"memory")
> #else
> #define x86_rmb() asm volatile("":::"memory")
> #define x86_mb() asm volatile("":::"memory")
> #define x86_smb() asm volatile("":::"memory")
> #endif
>
> #define FIRST values[0]
> #define SECOND values[640]
> #define FLAG values[1280]
>
> volatile unsigned values[2000] = {};
>
> void* writer_thread(void* arg) {
> while (1) {
> /* Now synchronize with reader */
> while(FLAG);
> FIRST++;
> x86_smb();
> SECOND++;
> x86_smb();
> FLAG = 1;
> }
> }
>
> void* reader_thread(void* arg) {
> while (1) {
> /* Now synchronize with writer */
> while(!FLAG);
> x86_rmb();
> unsigned first = FIRST;
> x86_rmb();
> unsigned second = SECOND;
> assert(first - second == 1 || first - second == 0);
> FLAG = 0;
>
> if (!(first %1000000))
> printf("%d\n", first);
> }
> }
>
> int main() {
> pthread_t writer, reader;
>
> pthread_create(&writer, NULL, writer_thread, NULL);
> pthread_create(&reader, NULL, reader_thread, NULL);
>
> pthread_join(writer, NULL);
> pthread_join(reader, NULL);
>
> return 0;
> }
>
I tried it on host and VM of NVidia's grace-hopper. Without the barriers, I
can hit assert. With the barriers, it's working fine without hitting the
assert.
I also had some code to mimic virtio vring last weekend, and it's just
working well. Back to our original issue, __smb_wmb() is issued by guest
while __smb_rmb() is executed on host. The VM and host are running at
different exception level: EL2 vs EL1. I'm not sure it's the cause. I
need to modify my code so that __smb_wmb() and __smb_rmb() can be executed
from guest and host.
[gshan@...an code]$ cat test.h
#ifndef __TEST_H
#define __TEST_H
struct vring_desc {
uint64_t addr;
uint32_t len;
uint16_t flags;
uint16_t next;
} __attribute__((aligned(4)));
struct vring_avail {
uint16_t flags;
uint16_t idx;
uint16_t ring[];
} __attribute__((aligned(4)));
struct vring_used_elem {
uint32_t id;
uint32_t len;
} __attribute__((aligned(4)));
struct vring_used {
uint16_t flags;
uint16_t idx;
struct vring_used_elem ring[];
} __attribute__((aligned(4)));
struct vring {
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
uint8_t pad0[64];
/* Writer */
uint32_t num;
uint32_t w_num_free;
uint32_t w_free_head;
uint16_t w_avail_idx;
uint16_t w_last_used_idx;
uint16_t *w_extra_data;
uint16_t *w_extra_next;
uint8_t pad1[64];
/* Reader */
uint16_t r_avail_idx;
uint16_t r_last_avail_idx;
uint16_t r_last_used_idx;
uint8_t pad2[64];
};
static inline unsigned int vring_size(unsigned int num, unsigned long align)
{
return ((sizeof(struct vring_desc) * num +
sizeof(uint16_t) * (3 + num) + (align - 1)) & ~(align - 1)) +
sizeof(uint16_t) * 3 + sizeof(struct vring_used_elem) * num;
}
static inline void __smp_rmb(void)
{
#ifdef WEAK_BARRIER
__asm__ volatile("dmb ishld" : : : "memory");
#else
__asm__ volatile("dsb sy" : : : "memory");
#endif
}
static inline void __smp_wmb(void)
{
#ifdef WEAK_BARRIER
__asm__ volatile("dmb ishst" : : : "memory");
#else
__asm__ volatile("dsb sy" : : : "memory");
#endif
}
#endif /* __TEST_H */
[gshan@...an code]$ cat test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h>
#include <assert.h>
#include <sched.h>
#include <pthread.h>
#include "test.h"
static struct vring *vring;
static int bind_cpu(int cpuid)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(cpuid, &cpuset);
return pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
}
static void write_free_used_desc(void)
{
uint16_t last_used;
uint32_t idx;
if ((uint16_t)(vring->used->idx - vring->w_last_used_idx) < 64)
return;
while (true) {
if (vring->w_last_used_idx == vring->used->idx)
return;
__smp_rmb();
/* Retrieve the head */
last_used = vring->w_last_used_idx & (vring->num - 1);
idx = vring->used->ring[last_used].id;
assert(idx < vring->num);
assert(vring->w_extra_data[idx]);
/* Reclaim the descriptor */
vring->w_extra_data[idx] = 0;
vring->w_extra_next[idx] = vring->w_free_head;
vring->w_free_head = idx;
/* Update statistics */
vring->w_num_free++;
vring->w_last_used_idx++;
}
}
static void write_push_desc(void)
{
uint32_t head = vring->w_free_head;
uint32_t avail_idx;
if (vring->w_num_free < 1)
return;
/*
* The data in the descriptor doesn't matter. The idea here
* is to dirty the cache line.
*/
vring->desc[head].flags = 1;
vring->desc[head].addr = 0xffffffffffffffff;
vring->desc[head].len = 0xffffffff;
vring->desc[head].next = vring->w_extra_next[head];
vring->desc[head].flags = 0;
vring->w_num_free--;
vring->w_free_head = vring->w_extra_next[head];
vring->w_extra_data[head] = 1;
avail_idx = vring->w_avail_idx & (vring->num - 1);
vring->avail->ring[avail_idx] = head;
__smp_wmb();
vring->w_avail_idx++;
vring->avail->idx = vring->w_avail_idx;
}
static void *write_worker(void *arg)
{
assert(!bind_cpu(10));
while (true) {
write_free_used_desc();
write_push_desc();
}
return NULL;
}
static void read_pull_desc(void)
{
uint16_t avail_idx, last_avail_idx;
uint32_t head;
last_avail_idx = vring->r_last_avail_idx;
if (vring->r_avail_idx == vring->r_last_avail_idx) {
vring->r_avail_idx = vring->avail->idx;
if (vring->r_avail_idx == last_avail_idx)
return;
__smp_rmb();
}
head = vring->avail->ring[last_avail_idx & (vring->num - 1)];
assert(head < vring->num);
vring->r_last_avail_idx++;
vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].id = head;
vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].len = 0;
vring->r_last_used_idx++;
__smp_wmb();
vring->used->idx = vring->r_last_used_idx;
}
static void *read_worker(void *arg)
{
assert(!bind_cpu(60));
while (true) {
read_pull_desc();
}
return NULL;
}
static void init_vring(unsigned int num, unsigned long align)
{
unsigned int size, i;
/* vring */
vring = malloc(sizeof(*vring));
assert(vring);
memset(vring, 0, sizeof(*vring));
/* Descriptors */
size = vring_size(num, align);
vring->desc = (struct vring_desc *)malloc(size);
assert(vring->desc);
memset(vring->desc, 0, size);
vring->avail = (struct vring_avail *)((void *)vring->desc +
num * sizeof(struct vring_desc));
vring->used = (struct vring_used *)(((unsigned long)&vring->avail->ring[num] +
sizeof(uint16_t) + (align - 1)) & ~(align - 1));
/* Writer's extra data */
vring->w_extra_data = malloc(sizeof(uint16_t) * num);
assert(vring->w_extra_data);
memset(vring->w_extra_data, 0, sizeof(uint16_t) * num);
vring->w_extra_next = malloc(sizeof(uint16_t) * num);
assert(vring->w_extra_next);
memset(vring->w_extra_next, 0, sizeof(uint16_t) * num);
for (i = 0; i < num - 1; i++)
vring->w_extra_next[i] = i + 1;
/* Statistics */
vring->num = num;
vring->w_num_free = num;
vring->w_free_head = 0;
vring->w_avail_idx = 0;
vring->w_last_used_idx = 0;
vring->r_avail_idx = 0;
vring->r_last_avail_idx = 0;
vring->r_last_used_idx = 0;
}
int main(int argc, char **argv)
{
pthread_t w_tid, r_tid;
int ret;
assert(!bind_cpu(0));
init_vring(256, 64);
ret = pthread_create(&w_tid, NULL, write_worker, NULL);
assert(!ret);
ret = pthread_create(&r_tid, NULL, read_worker, NULL);
assert(!ret);
ret = pthread_join(w_tid, NULL);
assert(!ret);
ret = pthread_join(r_tid, NULL);
assert(!ret);
while (true) {
sleep(1);
}
return 0;
}
Thanks,
Gavin
Powered by blists - more mailing lists