[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4FE2B047.503@intel.com>
Date: Thu, 21 Jun 2012 13:25:27 +0800
From: Alex Shi <alex.shi@...el.com>
To: Andi Kleen <ak@...ux.intel.com>, linux-tegra@...r.kernel.org,
linux-omap@...r.kernel.org
CC: tglx@...utronix.de, mingo@...hat.com, hpa@...or.com, arnd@...db.de,
rostedt@...dmis.org, fweisbec@...il.com, jeremy@...p.org,
seto.hidetoshi@...fujitsu.com, borislav.petkov@....com,
tony.luck@...el.com, luto@....edu, riel@...hat.com, avi@...hat.com,
len.brown@...el.com, tj@...nel.org, akpm@...ux-foundation.org,
cl@...two.org, jbeulich@...e.com, eric.dumazet@...il.com,
akinobu.mita@...il.com, cpw@....com, penberg@...nel.org,
steiner@....com, viro@...iv.linux.org.uk,
kamezawa.hiroyu@...fujitsu.com, aarcange@...hat.com,
rientjes@...gle.com, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v8 8/8] x86/tlb: do flush_tlb_kernel_range by 'invlpg'
On 06/14/2012 09:26 AM, Alex Shi wrote:
> On 06/14/2012 09:10 AM, Alex Shi wrote:
>
>> On 06/13/2012 10:56 PM, Andi Kleen wrote:
>>
>>> On Tue, Jun 12, 2012 at 05:06:45PM +0800, Alex Shi wrote:
>>>> This patch do flush_tlb_kernel_range by 'invlpg'. The performance pay
>>>> and gain was analysed in my patch (x86/flush_tlb: try flush_tlb_single
>>>> one by one in flush_tlb_range). Now we move this logical into kernel
>>>> part. The pay is multiple 'invlpg' execution cost, that is same. but
>>>> the gain(cost reducing of TLB entries refilling) is absolutely
>>>> increased.
>>>
>>> The subtle point is whether INVLPG flushes global pages or not.
>>> After some digging I found a sentence in the SDM that says it does.
>>> So it may be safe.
>>
>>
>> Many thanks for your time!
>>
>>>
>>> What does it improve?
>>
>>
I just wrote a rough kernel module that allocates some page arrays in the kernel and then maps them to a vaddr via 'vmap'.
Then my macro benchmark injects an 'unmap_kernel_range' request through a sysfs interface, while doing random memory accesses in user space at the same time.
On my NHM EP 2P * 4 Cores * HT.
Without this patch, memory access with 4 threads costs ~12ns/time.
With this patch, memory access with 4 threads costs ~9ns/time.
As the thread count increases the benefit becomes smaller, nearly disappearing once the thread number reaches 256.
But there is no regression.
The rough user macro-benchmark and kernel module is here:
--- kernel module--
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/sysfs.h>
#include <linux/hrtimer.h>
#include <linux/device.h>
#include <linux/cpu.h>
MODULE_LICENSE("Dual BSD/GPL");
/*
* $cat Makefile
* obj-m := modvmalloc.o
*
* compile command:
* #cd linux; make /home/alexs/exec/modules/modvmalloc.ko
*/
#define NR_PAGES (4)
#define NR_BLOCKS (1024)
/*
 * One test block: an array of NR_PAGES page pointers plus the kernel
 * virtual address they are vmap()ed to.
 */
struct block {
	struct page **page_array;	/* physical pages backing the mapping */
	void *vaddr;			/* vmap()ed kernel virtual address */
	int page_count;			/* pages actually allocated so far */
};
struct block *block;

/*
 * Number of blocks to map/unmap; settable at insmod time.
 * Was 'int': module_param(..., uint, ...) requires an unsigned int
 * variable or the moduleparam type check breaks the build.
 */
static unsigned int blocks = NR_BLOCKS;
module_param(blocks, uint, 0400);
MODULE_PARM_DESC(blocks, "map unmap blocks number ");
/*
 * Allocate a zeroed array of @nr_pages page pointers.  Falls back to
 * vzalloc() when the array does not fit in a single kmalloc page.
 * Fix: the original hard-coded NR_PAGES here and ignored @nr_pages.
 */
static struct page **relay_alloc_page_array(unsigned int nr_pages)
{
	const size_t pa_size = nr_pages * sizeof(struct page *);

	if (pa_size > PAGE_SIZE)
		return vzalloc(pa_size);
	return kzalloc(pa_size, GFP_KERNEL);
}
/*
 * Release a page-pointer array obtained from relay_alloc_page_array(),
 * choosing the matching free routine for its allocator.
 */
static void relay_free_page_array(struct page **array)
{
	if (!is_vmalloc_addr(array)) {
		kfree(array);
		return;
	}
	vfree(array);
}
/*
 * Tear down the kernel-range mapping of every block.
 * Fix: the original passed 'block->vaddr' (i.e. block[0].vaddr) on
 * every iteration, so only the first block was ever unmapped.
 */
static void vmap_unmap(void)
{
	int i;

	for (i = 0; i < blocks; i++)
		unmap_kernel_range((unsigned long)(block[i].vaddr),
				   NR_PAGES*PAGE_SIZE);
}
// ---------------
/* Last value written to the sysfs vmap_num attribute. */
long vmap_num = 0;

/*
 * Parse the user-supplied number, run one vunmap stress pass and
 * report the average cost per unmap in ns.  @smt is currently unused
 * (kept for interface compatibility with the caller).
 * Returns @count on success, -EINVAL on a malformed write.
 */
static ssize_t __vmap_num_store(const char *buf,
		size_t count, int smt)
{
	long factor = 0;
	long i;
	unsigned long start, stop;

	if (sscanf(buf, "%ld", &factor) != 1)
		return -EINVAL;
	vmap_num = factor;

	start = ktime_to_ns(ktime_get());
	vmap_unmap();
	stop = ktime_to_ns(ktime_get());

	i = blocks;
	if (i > 0)	/* guard: blocks==0 would divide by zero */
		printk(KERN_ERR "vunmap %ld times cost %ld ns/time\n",
				i, (stop - start)/i);
	return count;
}
/* sysfs 'show' handler: print the last value written to vmap_num. */
static ssize_t vmap_num_show(struct device *dev,
		struct device_attribute *attr,
		char *buf)
{
	ssize_t len;

	len = sprintf(buf, "%ld\n", vmap_num);
	return len;
}
/* sysfs 'store' handler: forward the write to __vmap_num_store
 * (smt argument fixed to 0; it is unused there). */
static ssize_t vmap_num_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
return __vmap_num_store(buf, count, 0);
}
/* Declares dev_attr_vmap_num: a 0644 attribute file backed by the
 * show/store handlers above. */
DEVICE_ATTR(vmap_num, 0644,
vmap_num_show,
vmap_num_store);
/* Create the vmap_num attribute file under @dev
 * (here: /sys/devices/system/cpu/vmap_num). */
int create_sysfs_vmap_num(struct device *dev)
{
return device_create_file(dev, &dev_attr_vmap_num);
}
static int mapunmap_init(void){
long i,j,k;
create_sysfs_vmap_num(cpu_subsys.dev_root);
block = kmalloc(sizeof(struct block)*blocks, GFP_KERNEL);
for (k=0; k< blocks; k++) {
block[k].page_count = 0;
block[k].page_array = relay_alloc_page_array(NR_PAGES);
if (!block[k].page_array)
return -1;
for (i = 0; i < NR_PAGES; i++) {
block[k].page_array[i] = alloc_page(GFP_KERNEL);
if (unlikely(!block[k].page_array[i])) {
printk(KERN_ERR "\talloc page error \n");
goto depopulate;
}
}
if (i!=NR_PAGES) goto depopulate;
block[k].page_count = i;
block[k].vaddr = vmap(block[k].page_array, NR_PAGES, VM_MAP, PAGE_KERNEL);
if (!(block[k].vaddr)) {
printk(KERN_ERR "\t\t vmap error !\n");
goto depopulate;
}
}
printk(KERN_INFO "vmalloc module init OK \n");
return 0;
depopulate:
for (i=0; i< k; i++)
if (block[i].page_count !=0) {
for (j = 0; j < block[i].page_count; j++)
__free_page((block[j].page_array[j]));
relay_free_page_array(block[j].page_array);
}
printk(KERN_INFO "vmalloc module init fail\n");
return -1;
}
static void mapunmap_exit(void){
long i, j;
printk(KERN_INFO "bye! this is test module\n");
device_remove_file(cpu_subsys.dev_root, &dev_attr_vmap_num);
for (i=0; i< blocks; i++)
if (block[i].page_count !=0) {
for (j = 0; j < block[i].page_count; j++)
__free_page((block[j].page_array[j]));
relay_free_page_array(block[j].page_array);
}
}
module_init(mapunmap_init);
module_exit(mapunmap_exit);
--- benchmark ---
/*
maccess.c
This is a macrobenchmark for TLB flush range testing.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Copyright (C) Intel 2012
Copyright Alex Shi alex.shi@...el.com
gcc -o maccess maccess.c -lrt -lpthread -O2
#perf stat -e r881,r882,r884 -e r801,r802,r810,r820,r840,r880,r807 -e rc01 -e r4901,r4902,r4910,r4920,r4940,r4980 -e r5f01 -e rbd01,rdb20 -e r4f02 -e r8004,r8201,r8501,r8502,r8504,r8510,r8520,r8540,r8580 -e rae01,rc820,rc102,rc900 -e r8600 -e rcb10 ./maccess
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <time.h>
#include <sys/types.h>
#include <pthread.h>
#define FILE_SIZE (1024*1024*1024)
#define PAGE_SIZE (4096)
#define HPAGE_SIZE (4096*512)
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif
/*
 * Read POSIX clock @clockid and return the time in nanoseconds.
 * Returns -1 on clock_gettime() failure; the original fell through and
 * read the uninitialized struct timespec (undefined behavior).
 */
long getnsec(clockid_t clockid) {
	struct timespec ts;

	if (clock_gettime(clockid, &ts) == -1) {
		perror("clock_gettime failed");
		return -1;
	}
	return (long) ts.tv_sec * 1000000000 + (long) ts.tv_nsec;
}
//data for threads: parameters handed to each accessmm() worker
struct data{
int pagenum;	/* random pages touched per sweep */
void *startaddr;	/* base of the mmap()ed test region */
int rw;	/* 0 = read accesses, otherwise writes */
int loop;	/* unused */
};
//shared start/stop flag: 0 = wait, 1 = run, back to 0 = stop and exit
volatile int * threadstart;
//thread for memory accessing
void *accessmm(void *data){
struct data *d = data;
long *actimes;
char x;
int i, k;
int randn[PAGE_SIZE];
for (i=0;i<PAGE_SIZE; i++)
randn[i] = rand();
actimes = malloc(sizeof(long));
while (*threadstart == 0 )
usleep(1);
if (d->rw == 0)
for (*actimes=0; *threadstart == 1; (*actimes)++)
for (k=0; k < d->pagenum; k++)
x = *(volatile char *)(d->startaddr + randn[k]%FILE_SIZE);
else
for (*actimes=0; *threadstart == 1; (*actimes)++)
for (k=0; k < d->pagenum; k++)
*(char *)(d->startaddr + randn[k]%FILE_SIZE) = 1;
return actimes;
}
/*
 * Benchmark driver: mmap a 1GB region, prefault it, start -t reader or
 * writer threads, trigger a kernel vunmap pass via the module's sysfs
 * file while the threads run, then report per-access cost.
 *
 * Fixes vs. original: mmap() and malloc() results checked; thread
 * count bounded by the pid[] array; divide-by-zero guarded in the
 * final report; dead locals (m1, startaddr2, protindex, x, j, k, the
 * unused randn copy, the overwritten first start/stop pair) and the
 * unreferenced 'end:' label removed; sprintf -> snprintf.
 */
int main(int argc, char *argv[])
{
	static char optstr[] = "p:w:ht:s:";
	int s = 1;	/* value echoed into the sysfs vmap_num file */
	int p = 512;	/* pages accessed per sweep by each thread */
	int er = 0, rw = 0, h = 0, t = 2; /* h: huge pages; t: threads */
	int pagesize = PAGE_SIZE;
	int i, c;
	void *startaddr;
	volatile void *tempaddr;
	clockid_t clockid = CLOCK_MONOTONIC;
	unsigned long start, stop, mptime, actime;
	pthread_t pid[1024];
	void *res;
	struct data data;
	char command[1024];

	while ((c = getopt(argc, argv, optstr)) != EOF)
		switch (c) {
		case 's':
			s = atoi(optarg);
			break;
		case 'p':
			p = atoi(optarg);
			break;
		case 'h':
			h = 1;
			break;
		case 'w':
			rw = atoi(optarg);
			break;
		case 't':
			t = atoi(optarg);
			break;
		case '?':
			er = 1;
			break;
		}
	if (er || t < 1 || t > 1024) {	/* pid[] holds at most 1024 */
		printf("usage: %s %s\n", argv[0], optstr);
		exit(1);
	}
	printf("pid is %d, thread number %d active %d seconds, access page num %d\n", getpid(), t, s, p);

	if (h == 0){
		startaddr = mmap(0, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
		pagesize = PAGE_SIZE;
	} else {
		startaddr = mmap(0, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB, -1, 0);
		pagesize = HPAGE_SIZE;
	}
	if (startaddr == MAP_FAILED) {	/* original never checked mmap */
		perror("mmap failed");
		exit(1);
	}

	//touch every page once so the timed phase takes no page faults
	for (tempaddr = startaddr; tempaddr < startaddr + FILE_SIZE; tempaddr += pagesize)
		memset((char *)tempaddr, 0, 1);

	threadstart = malloc(sizeof(int));
	if (!threadstart) {
		perror("malloc failed");
		exit(1);
	}
	*threadstart = 0;
	data.pagenum = p; data.startaddr = startaddr; data.rw = rw;
	for (i = 0; i < t; i++)
		if (pthread_create(&pid[i], NULL, accessmm, &data))
			perror("pthread create");
	//give the threads time to fill their random-offset tables
	sleep(1);

	mptime = actime = 0;
	snprintf(command, sizeof command,
		"sudo sh -c 'echo %d > /sys/devices/system/cpu/vmap_num'", s);
	printf("%s\n", command);

	start = getnsec(clockid);
	//kick threads, let them run while the kernel unmaps
	*threadstart = 1;
	system(command);
	*threadstart = 0;
	stop = getnsec(clockid);
	mptime += stop - start;

	//collect per-thread sweep counts
	for (i = 0; i < t; i++) {
		if (pthread_join(pid[i], &res))
			perror("pthread_join");
		actime += *(long*)res;
	}

	if (mptime == 0 || actime == 0) {	/* avoid divide-by-zero */
		printf("run too short to measure\n");
		exit(0);
	}
	printf("maccess %ld ms, memory access %ld times/thread/ms, cost %ldns/time\n",
		mptime/1000000, actime*p*1000000/t/mptime, mptime*t/(actime*p));
	exit(0);
}
>
>>
>>> -Andi
>>
>>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists