[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20100817195001.GA18817@linux.intel.com>
Date: Tue, 17 Aug 2010 15:50:01 -0400
From: Matthew Wilcox <willy@...ux.intel.com>
To: linux-mm@...ck.org
Cc: linux-kernel@...r.kernel.org
Subject: Re: [TESTCASE] Clean pages clogging the VM
No comment on this? Was it just that I posted it during the VM summit?
On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote:
>
> This testcase shows some odd behaviour from the Linux VM.
>
> It creates a 1TB sparse file, mmaps it, and randomly reads locations
> in it. Due to the file being entirely sparse, the VM allocates new pages
> and zeroes them. Initially, it runs very fast, taking on the order of
> 2.7 to 4us per page fault. Eventually, the VM runs out of free pages,
> and starts doing huge amounts of work trying to figure out which of
> these clean pages to throw away. In my testing with a 6GB machine
> and 2.9GHz CPU, one in every 15,000 page faults takes over a second,
> and one in every 40,000 page faults takes over seven seconds!
>
> This test-case demonstrates a problem that occurs with a read-mostly
> mmap of a file on very fast media. I wouldn't like to see a solution
> that special-cases zeroed pages. I think userspace has done its part
> to tell the kernel what it's doing by calling madvise(MADV_RANDOM).
> This ought to be enough to hint to the kernel that it should be eagerly
> throwing away pages in this VMA.
>
>
> /*
> * Copyright (c) 2010, Intel Corporation
> * All rights reserved.
> *
> * Redistribution and use in source and binary forms, with or without
> * modification, are permitted provided that the following conditions are met:
> *
> * * Redistributions of source code must retain the above copyright notice,
> * this list of conditions and the following disclaimer.
> * * Redistributions in binary form must reproduce the above copyright notice,
> * this list of conditions and the following disclaimer in the documentation
> * and/or other materials provided with the distribution.
> * * Neither the name of Intel Corporation nor the names of its contributors
> * may be used to endorse or promote products derived from this software
> * without specific prior written permission.
> *
> * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
> * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
> * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
> * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
> * POSSIBILITY OF SUCH DAMAGE.
> */
>
> #ifndef _GNU_SOURCE
> #define _GNU_SOURCE /* cpu_set_t, CPU_ZERO/CPU_SET, pthread_setaffinity_np */
> #endif
>
> #include <assert.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <limits.h>
> #include <math.h>
> #include <pthread.h>
> #include <signal.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/mman.h>
> #include <sys/stat.h>
> #include <sys/time.h>
> #include <sys/types.h>
> #include <time.h>
> #include <unistd.h>
>
> /*
>  * rdtscll(val) - store the CPU timestamp counter in 'val'.
>  *
>  * Executes the x86 RDTSC instruction, which returns the counter split
>  * across EAX (low 32 bits) and EDX (high 32 bits), and joins the halves.
>  * The halves are widened to unsigned long long before shifting: unsigned
>  * long is only 32 bits wide on 32-bit x86, where shifting it by 32 would
>  * be undefined behaviour.
>  */
> #define rdtscll(val) do { \
>     unsigned int tsc_lo_, tsc_hi_; \
>     __asm__ __volatile__("rdtsc" : "=a" (tsc_lo_), "=d" (tsc_hi_)); \
>     (val) = ((unsigned long long)tsc_lo_) | \
>             (((unsigned long long)tsc_hi_) << 32); \
> } while (0)
>
>
> /* Defaults and limits for the command-line options. */
> #define MAX_FILE_SIZE ((off_t)1024 * 1024 * 1024 * 1024) /* default -a: 1TB */
> #define MAX_FILE_IOS 16384 /* largest -n accepted; also sizes offset_buf[] */
> #define MAX_LATENCY 10000000 // usecs
>
> #define NUM_IOS 1024 /* default -n */
> #define IO_SIZE 4096 /* default -s, in bytes */
> #define BUFFER_SIZE (1024 * 1024) /* default -b, in bytes */
>
> pthread_t tid; /* the worker thread created by main() */
> double cpu_clock; /* measured timestamp-counter ticks per second */
> long long unsigned cpu_start, cpu_stop; /* counter samples around the measurement */
>
> void *mmap_test(void *arg);
> /* Full prototype (not "die()") so that calls are type-checked. */
> void die(char *string);
>
> /*
>  * Help text.  It is used as a printf-style format: the single %s is
>  * replaced by the program name passed to usage().
>  */
> static const char usage_cmds[] =
> "usage: %s [options]\n"
> "cmd line options:\n"
> " -f file_name Read from File named 'file_name'\n"
> " -a file_size File of 'file_size' Bytes/thread\n"
> " -b buffer_size Write/Read into/from buffer of 'buffer_size' Bytes/thread\n"
> " -n num_file_ios Process 'num_file_ios' IOs\n"
> " -s io_size IO Size = 'io_size' Bytes\n"
> " -l max_latency Show latency stats based on usecs of max_latency\n"
> ;
>
> /* Print the help text to stderr; 'program' is shown as the command name. */
> void usage(const char *program)
> {
> fprintf(stderr, usage_cmds, program);
> }
>
> /*
>  * Run-time settings.  Each starts at its compiled-in default and can be
>  * overridden by the command-line option named in the trailing comment.
>  */
> off_t file_size = MAX_FILE_SIZE; // -a
> long long unsigned int buffer_size = BUFFER_SIZE; // -b
> char *filename = "sparse-file"; // -f
> int num_file_ios = NUM_IOS; // -n
> int max_latency = MAX_LATENCY; // -l
> int io_size = IO_SIZE; // -s
> /* max_latency converted to timestamp-counter cycles; assigned in main(). */
> long long unsigned int latency_limit;
>
> /*
>  * Parse the argument of option 'opt' as an unsigned number in [min, max].
>  * Base 0 is used, so decimal, octal (0...) and hex (0x...) are accepted.
>  * Prints a message and exits with status 1 on malformed or out-of-range
>  * input instead of silently using whatever strtoul() happened to return.
>  */
> static unsigned long long parse_number(char opt, const char *text,
>                                        unsigned long long min,
>                                        unsigned long long max)
> {
>     char *end;
>     unsigned long long value;
>
>     errno = 0;
>     value = strtoull(text, &end, 0);
>     /* strtoull() silently negates "-N", so reject a leading minus. */
>     if (end == text || *end != '\0' || errno == ERANGE ||
>         text[0] == '-' || value < min || value > max) {
>         fprintf(stderr, "-%c: invalid value '%s'\n", opt, text);
>         exit(1);
>     }
>     return value;
> }
>
> /*
>  * Parse the command line, report the CPU frequency (as listed in
>  * /proc/cpuinfo and as measured with the timestamp counter), then run
>  * mmap_test() in one thread bound to CPU 0 and wait for it to finish.
>  *
>  * Returns 0 on success.  Bad options and set-up failures end the program
>  * with status 1.
>  */
> int main(int argc, char **argv)
> {
>     const unsigned int measure_secs = 5;
>     pthread_attr_t attr;
>     cpu_set_t mask;
>     FILE *proc;
>     char buf[256];
>     double mhz = 0.0;
>
>     while (1) {
>         /* -h takes no argument; -p had no handler and is no longer listed. */
>         int option = getopt(argc, argv, "a:b:f:hl:n:s:");
>         if (option == -1) {
>             break;
>         }
>         switch (option) {
>         case 'a': {
>             unsigned long long bytes = parse_number('a', optarg, 1, LLONG_MAX);
>
>             file_size = (off_t)bytes;
>             /* off_t may be narrower than long long on some builds. */
>             if (file_size <= 0 || (unsigned long long)file_size != bytes) {
>                 fprintf(stderr, "-a: %llu does not fit in off_t\n", bytes);
>                 exit(1);
>             }
>             printf("a: file_size:%lld Bytes :%lld MB\n",
>                    (long long)file_size,
>                    (long long)(file_size / (1024 * 1024)));
>             break;
>         }
>         case 'b':
>             buffer_size = parse_number('b', optarg, 1, ULLONG_MAX);
>             printf("b: buffer_size:%llu Bytes\n", buffer_size);
>             break;
>         case 'f':
>             filename = optarg;
>             printf("f: filename:%s\n", filename);
>             break;
>         case 'h':
>             printf("h: options\n");
>             goto help;
>         case 'l':
>             max_latency = (int)parse_number('l', optarg, 0, INT_MAX);
>             printf("l: latency stats based on max latency:%d\n", max_latency);
>             break;
>         case 'n':
>             num_file_ios = (int)parse_number('n', optarg, 0, INT_MAX);
>             printf("n: num_file_ios:%d\n", num_file_ios);
>             if (num_file_ios > MAX_FILE_IOS) {
>                 printf("-n %d Entered > MAX_FILE_IOS:%d\n", num_file_ios, MAX_FILE_IOS);
>                 exit(1);
>             }
>             break;
>         case 's':
>             /* io_size is used as a divisor later, so zero is rejected. */
>             io_size = (int)parse_number('s', optarg, 1, INT_MAX);
>             printf("s: io_size:%d Bytes\n", io_size);
>             break;
>         default:
> help:
>             usage(argv[0]);
>             printf("default:\n");
>             exit(1);
>         }
>     }
>
>     /*
>      * Read the frequency the kernel reports.  The value is only printed;
>      * the timing below relies on the measured cpu_clock instead.
>      */
>     proc = fopen("/proc/cpuinfo", "r");
>     if (!proc) {
>         fprintf(stderr, "unable to open /proc/cpuinfo: %s\n", strerror(errno));
>         return 1;
>     }
>
>     while (fgets(buf, sizeof buf, proc)) {
>         double cpu;
>
>         if (sscanf(buf, "cpu MHz : %lf", &cpu) != 1)
>             continue;
>         if (mhz == 0.0) {
>             mhz = cpu;
>             continue;
>         }
>         if (mhz != cpu) {
>             fprintf(stderr,
>                     "Conflicting CPU frequency values: %lf != %lf\n",
>                     mhz, cpu);
>             fclose(proc);
>             /* Still fatal, as before, but now reported as a failure. */
>             return 1;
>         }
>     }
>     fclose(proc);
>     printf("CPU Clock Freq from /proc/cpuinfo:%.4f\n", mhz);
>
>     //
>     // Measure CPU Core Frequency over a 5 second period
>     //
>     printf("Measuring CPU Frequency......:");
>     fflush(stdout); /* make the prompt visible before the wait */
>     rdtscll(cpu_start);
>     /* sleep() rather than usleep(): POSIX only defines usleep() below 1s. */
>     sleep(measure_secs);
>     rdtscll(cpu_stop);
>     cpu_clock = (double)(cpu_stop - cpu_start) / (double)measure_secs;
>     printf("%.3f\n", cpu_clock);
>     latency_limit = (long long unsigned int) (cpu_clock * max_latency / 1000000);
>     printf("latency_limit:%llu cycles or %d usecs\n", latency_limit, max_latency);
>
>     pthread_attr_init(&attr);
>     pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);
>     pthread_attr_setstacksize(&attr, (size_t) (1024 * 1024));
>
>     if (pthread_create(&tid, &attr, mmap_test, (void *)(long) 0) != 0) {
>         die("Thread create failed!");
>     }
>     pthread_attr_destroy(&attr);
>
>     /*
>      * Bind the worker to CPU 0.  NOTE(review): the thread is already
>      * running at this point, so its first faults may happen on another
>      * CPU.
>      */
>     CPU_ZERO(&mask);
>     CPU_SET(0, &mask);
>     if (pthread_setaffinity_np(tid, sizeof(mask), &mask)) {
>         printf("WARNING: could not set CPU Affinity, exit...\n");
>         exit(1);
>     }
>
>     pthread_join(tid, NULL);
>     sleep(1);
>
>     return 0;
> }
>
>
> /*
>  * Report a fatal error and terminate.
>  *
>  * Writes 'string' to stderr, prefixed with the program tag and surrounded
>  * by newlines, then exits with status 1.  Does not return.
>  */
> void die(char *string)
> {
>     fprintf(stderr, "\nmmap_test: %s\n", string);
>
>     exit(1);
> }
>
> /*
>  * Open 'fname' (creating it if needed), make sure it is at least 'size'
>  * bytes long, and map its first 'size' bytes shared and read/write.
>  *
>  * The file descriptor is stored in *filed; the caller closes it and
>  * unmaps the returned address.  MADV_RANDOM is requested for the whole
>  * mapping; because that is only a hint, a failure there is reported as a
>  * warning.  Every other failure prints a message and exits with status 1.
>  */
> void *mmapfile(char *fname, off_t size, int *filed)
> {
>     int fd;
>     void *file_addr;
>     struct stat statbuf;
>
>     fd = open(fname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
>     *filed = fd;
>     if (fd < 0) {
>         fprintf(stderr, "unable to open %s to get an FD:%s\n", fname, strerror(errno));
>         exit(1);
>     }
>
>     if (fstat(fd, &statbuf) < 0) {
>         fprintf(stderr, "unable to stat %s: %s\n", fname, strerror(errno));
>         exit(1);
>     }
>
>     /*
>      * If the file could not be extended, the mapping would still succeed
>      * but touching pages beyond the real end of file raises SIGBUS, so
>      * stop here with a clear message instead.
>      */
>     if (statbuf.st_size < size && ftruncate(fd, size) < 0) {
>         fprintf(stderr, "unable to extend %s to %lld bytes: %s\n",
>                 fname, (long long)size, strerror(errno));
>         exit(1);
>     }
>
>     file_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>     if (file_addr == MAP_FAILED) {
>         fprintf(stderr, "datafile mmap failed: %s\n", strerror(errno));
>         exit(1);
>     }
>
>     if (madvise(file_addr, size, MADV_RANDOM) < 0)
>         fprintf(stderr, "warning: madvise(MADV_RANDOM) failed: %s\n", strerror(errno));
>
>     return file_addr;
> }
>
> /*
>  * Fill offset_buf[0 .. num_file_ios-1] with pseudo-random file offsets in
>  * [0, file_size), each rounded down to a multiple of io_size.
>  *
>  * The generator is seeded from the current time divided by
>  * (threadnum + 1).  Numbers are drawn with random(), the generator that
>  * srandom() seeds; drawing from rand() would ignore the seed and produce
>  * the same offsets on every run.
>  */
> void create_offsets(off_t *offset_buf, int threadnum)
> {
>     int i;
>
>     srandom((unsigned int)(time(NULL) / (threadnum + 1)));
>
>     for (i = 0; i < num_file_ios; i++) {
>         /* random() returns 0 .. 2^31-1, so the fraction lies in [0, 1). */
>         double fraction = (double)random() / 2147483648.0;
>         off_t offset = (off_t)(file_size * fraction);
>
>         /* Align to the I/O size; skip the step rather than divide by zero. */
>         if (io_size > 0)
>             offset = offset / io_size * io_size;
>         offset_buf[i] = offset;
>     }
> }
>
> /*
>  * Thread body: time single-byte reads from the mapped file.
>  *
>  * 'arg' carries the thread number as an integer cast to a pointer.  The
>  * file named by 'filename' is mapped with mmapfile(), num_file_ios start
>  * offsets are chosen with create_offsets(), and from each start offset up
>  * to buffer_size / io_size bytes are read, io_size bytes apart.  Each read
>  * is bracketed by two timestamp-counter samples and the difference, in
>  * cycles, is printed on its own line.
>  *
>  * Returns NULL.  Exits the process with status 1 if io_size is not
>  * positive or the destination buffer cannot be allocated.
>  */
> void *mmap_test(void *arg)
> {
>     int threadnum = (long) arg;
>     int fd;
>     int err;
>     int i, j, ios;
>     void *mem = NULL;
>     char *file_addr;
>     char *buf_addr;
>     off_t offset_buf[MAX_FILE_IOS];
>     unsigned long long latency_start, latency_stop;
>
>     /* io_size is a divisor and a stride below; zero or less cannot work. */
>     if (io_size <= 0) {
>         fprintf(stderr, "mmap_test: io_size must be positive\n");
>         exit(1);
>     }
>
>     /* posix_memalign() returns the error number; it does not set errno. */
>     err = posix_memalign(&mem, 4096, buffer_size);
>     if (err != 0) {
>         fprintf(stderr, "mmap_test: unable to allocate %llu bytes: %s\n",
>                 buffer_size, strerror(err));
>         exit(1);
>     }
>     buf_addr = mem;
>
>     file_addr = mmapfile(filename, file_size, &fd);
>
>     ios = buffer_size / io_size;
>
>     create_offsets(offset_buf, threadnum);
>
>     for (j = 0; j < num_file_ios; j++) {
>         char *buf_ptr = buf_addr;
>         off_t file_off = offset_buf[j];
>
>         /*
>          * A run that starts within buffer_size of the end of the file
>          * would otherwise step past the end of the mapping, so stop at
>          * file_size.
>          */
>         for (i = 0; i < ios && file_off < file_size; i++) {
>             rdtscll(latency_start);
>             *buf_ptr = file_addr[file_off];
>             rdtscll(latency_stop);
>             printf("%llu\n", latency_stop - latency_start);
>             buf_ptr += io_size;
>             file_off += io_size;
>         }
>     }
>
>     munmap(file_addr, file_size);
>     close(fd);
>     free(buf_addr);
>
>     /* Returning from the start routine ends the thread like pthread_exit(). */
>     return NULL;
> }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists