From 88476f95801f7177c3a8c30c663cdd788f73a3b5 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang
Date: Mon, 23 Jun 2025 10:10:37 +0800
Subject: [PATCH] memcpy_flushcache_optimized on nd_pmem.ko

Signed-off-by: Dongsheng Yang
---
 drivers/nvdimm/pmem.c | 53 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index aa50006b7616..78c2e8481630 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -122,6 +122,57 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
 	return BLK_STS_OK;
 }
 
+/*
+ * Copy @size bytes from @source to the persistent-memory address @dest and
+ * write the destination cache lines back.
+ *
+ * Whole 64-byte lines are written with memcpy() + clflushopt when the CPU has
+ * CLFLUSHOPT, the cache line size is 64 and @dest is cache-line aligned.
+ * Everything else (an unaligned @dest, the sub-line tail, other CPUs and
+ * architectures) goes through memcpy_flushcache().
+ *
+ * Benchmark behind the clflushopt path:
+ *
+ * block size	512		1024		2048		4096
+ * movnti	496 MB/s	642 MB/s	725 MB/s	744 MB/s
+ * clflushopt	373 MB/s	688 MB/s	1.1 GB/s	1.2 GB/s
+ *
+ * movnti performs better for 512-byte blocks and clflushopt performs better
+ * for 1024-byte and larger blocks, which suggests a cut-off around 768 bytes.
+ * This helper applies no such cut-off: every whole cache line of an eligible
+ * copy uses clflushopt.
+ *
+ * NOTE: this happens to be the case now (with dm-writecache's single
+ * threaded model) but re-evaluate this once memcpy_flushcache() is
+ * enabled to use movdir64b which might invalidate this performance
+ * advantage seen with cache-allocating-writes plus flushing.
+ */
+static void memcpy_flushcache_optimized(void *dest, const void *source,
+					size_t size)
+{
+#ifdef CONFIG_X86
+	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+	    likely(boot_cpu_data.x86_clflush_size == 64) &&
+	    likely(((unsigned long)dest & 63) == 0)) {
+		/*
+		 * Checking @size before each copy keeps the loop inside the
+		 * buffer, and the aligned @dest means each 64-byte copy fills
+		 * exactly the one line that clflushopt then writes back.
+		 */
+		while (size >= 64) {
+			memcpy(dest, source, 64);
+			clflushopt(dest);
+			dest += 64;
+			source += 64;
+			size -= 64;
+		}
+	}
+#endif
+	/* Tail bytes, or the whole buffer when the fast path did not apply. */
+	if (size)
+		memcpy_flushcache(dest, source, size);
+}
+
 static void write_pmem(void *pmem_addr, struct page *page,
 		unsigned int off, unsigned int len)
 {
@@ -131,7 +182,7 @@ static void write_pmem(void *pmem_addr, struct page *page,
 	while (len) {
 		mem = kmap_atomic(page);
 		chunk = min_t(unsigned int, len, PAGE_SIZE - off);
-		memcpy_flushcache(pmem_addr, mem + off, chunk);
+		memcpy_flushcache_optimized(pmem_addr, mem + off, chunk);
 		kunmap_atomic(mem);
 		len -= chunk;
 		off = 0;
-- 
2.43.0