Message-ID: <Pine.LNX.4.64.0608291136440.20095@schroedinger.engr.sgi.com>
Date:	Tue, 29 Aug 2006 11:41:28 -0700 (PDT)
From:	Christoph Lameter <clameter@....com>
To:	Andi Kleen <ak@...e.de>
cc:	David Howells <dhowells@...hat.com>,
	Nick Piggin <nickpiggin@...oo.com.au>,
	Arjan van de Ven <arjan@...radead.org>,
	Dong Feng <middle.fengdong@...il.com>,
	Paul Mackerras <paulus@...ba.org>,
	linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org
Subject: Re: Why Semaphore Hardware-Dependent?

On Tue, 29 Aug 2006, Andi Kleen wrote:

> > Correct. However, a cmpxchg may have to acquire that cacheline multiple 
> > times in a highly contended situation.
> 
> Hmm, thinking about it, that sounds unlikely, because only the first and the last
> user should touch the rwsem head. But ok, it might be possible.

Well that sounds encouraging.
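
To illustrate the pattern in question: a cmpxchg update is typically a
retry loop, and every failed attempt pulls the cacheline back in exclusive
state, so one logical update can acquire the line several times under
contention. A minimal userspace sketch (using the GCC __sync builtin as a
stand-in for the kernel's cmpxchg; illustrative only):

static long counter;

void add_via_cmpxchg(long delta)
{
        long old, newval;

        do {
                old = counter;          /* read: line may be in shared state */
                newval = old + delta;
                /* the cmpxchg needs the line exclusive; if another CPU
                   changed counter meanwhile, we loop and pay again */
        } while (__sync_val_compare_and_swap(&counter, old, newval) != old);
}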

> BTW maybe it would be a good idea to switch the wait list to a hlist,
> then the last user in the queue wouldn't need to 
> touch the cache line of the head. Or maybe even a single linked
> list then some more cache bounces might be avoidable.
> 
> That is really why we need a single C implementation. If we had that 
> such optimizations would be pretty easy. Without it it's a big mess.

I am all for optimizations....
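
To make the hlist idea concrete, an illustrative sketch (not the kernel's
actual rwsem structures): with a circular doubly linked wait list, the last
waiter links back to the list head, which sits next to the hot semaphore
count, so unlinking the tail dirties that cacheline. With a NULL-terminated
singly linked list, unlinking a later waiter touches only its predecessor's
node:

struct waiter {
        struct waiter *next;    /* NULL-terminated: no back pointer to the head */
};

/* Unlink w given a pointer to its predecessor's next field; the
   semaphore head is written only when w is the first waiter. */
void dequeue(struct waiter **pprev, struct waiter *w)
{
        *pprev = w->next;
}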
> 
> > We have long tuned that portion of the code and therefore we are 
> > skeptical of changes. But if we cannot measure a difference to a 
> > generic implementation then it would be okay.
> 
> Would you be willing to run numbers comparing them? (or provide a benchmark?) 

Yes. The infamous page fault test (pft.c) really hits this one hard, and on
a large SMP system you should be able to see problems immediately. In fact,
since the page table lock scaling fixes went in, the page fault rate on our
large system has been limited by rwsem performance, due to the mmap_sem
cacheline becoming contended. We'd be glad about any improvements in our
numbers.


Benchmark code follows (you may have to do some tweaking for 32 
bit):

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

#include <sys/mman.h>
#include <sys/wait.h>		/* wait() */
#include <strings.h>		/* bzero() */
#include <time.h>
#include <errno.h>
#include <sys/resource.h>

extern int      optind, opterr;
extern char     *optarg;

long     bytes=16384;
long     sleepsec=0;
long     verbose=0;
long     forkcnt=1;
long     repeatcount=1;
long	cachelines=1;
long     do_bzero=0;
long     mypid;
int 	title=0;

volatile int    go, state[128];

struct timespec wall,wall_end,wall_start;
struct rusage ruse;
long faults;
long pages;
long gbyte;
double faults_per_sec;
double faults_per_sec_per_cpu;

#define perrorx(s)      (perror(s), exit(1))
#define NBPP            16384

void* test(void*);
void  launch(void);


int
main (int argc, char *argv[])
{
        int                     i, c, stat, er=0;
        static  char            optstr[] = "b:c:f:g:r:s:vzHt";


        opterr=1;
        while ((c = getopt(argc, argv, optstr)) != EOF)
                switch (c) {
                case 'g':
                        bytes = atol(optarg)*1024*1024*1024;
                        break;
                case 'b':
                        bytes = atol(optarg);
                        break;
		case 'c':
			cachelines = atol(optarg);
			break;
                case 'f':
                        forkcnt = atol(optarg);
                        break;
                case 'r':
                        repeatcount = atol(optarg);
                        break;
                case 's':
                        sleepsec = atol(optarg);
                        break;
                case 'v':
                        verbose++;
                        break;
                case 'z':
                        do_bzero++;
                        break;
                case 'H':
                        er++;
                        break;
		case 't' :
			title++;
			break;
                case '?':
                        er = 1;
                        break;
                }

        if (er) {
                printf("usage: %s %s\n", argv[0], optstr);
                exit(1);
        }

	pages = bytes*repeatcount/getpagesize();
	gbyte = bytes/(1024*1024*1024);
	bytes = bytes/forkcnt;

	clock_gettime(CLOCK_REALTIME,&wall_start);

	if (verbose) printf("Calculated pages=%ld pagesize=%ld.\n",pages,getpagesize());

        mypid = getpid();
        setpgid(0, mypid);

        for (i=0; i<repeatcount; i++) {
                if (fork() == 0)
                        launch();
                while (wait(&stat) > 0);
        }
	
	getrusage(RUSAGE_CHILDREN,&ruse);
	clock_gettime(CLOCK_REALTIME,&wall_end);
	wall.tv_sec = wall_end.tv_sec - wall_start.tv_sec;
	wall.tv_nsec = wall_end.tv_nsec - wall_start.tv_nsec;
	if (wall.tv_nsec <0 )		{ wall.tv_sec--; wall.tv_nsec += 1000000000; }
	if (wall.tv_nsec >1000000000)	{ wall.tv_sec++; wall.tv_nsec -= 1000000000; }
	if (verbose) printf("Calculated faults=%ld. Real minor faults=%ld, major faults=%ld\n",pages,ruse.ru_minflt+ruse.ru_majflt);
	faults_per_sec=(double) pages / ((double) wall.tv_sec + (double) wall.tv_nsec / 1000000000.0);
	faults_per_sec_per_cpu=(double) pages /  (
		(double) (ruse.ru_utime.tv_sec + ruse.ru_stime.tv_sec) + ((double) (ruse.ru_utime.tv_usec + ruse.ru_stime.tv_usec) / 1000000.0));
	if (title) printf(" Gb Rep Thr CLine  User      System   Wall  flt/cpu/s fault/wsec\n");
	printf("%3ld %2ld %4ld %3ld %4ld.%02lds%7ld.%02lds%4ld.%02lds%10.3f %10.3f\n",
		gbyte,repeatcount,forkcnt,cachelines,
		ruse.ru_utime.tv_sec,ruse.ru_utime.tv_usec/10000,
		ruse.ru_stime.tv_sec,ruse.ru_stime.tv_usec/10000,
		wall.tv_sec,wall.tv_nsec/10000000,
		faults_per_sec_per_cpu,faults_per_sec);
        exit(0);
}

char *
do_shm(long shmlen) {
        char    *p;
        int     shmid;

        printf ("Try to allocate TOTAL shm segment of %ld bytes\n", shmlen);

        if ((shmid = shmget(IPC_PRIVATE, shmlen, SHM_R|SHM_W))  == -1)
                perrorx("shmget failed");

        if ((p=(char*)shmat(shmid, (void*)0, 0)) == (char*)-1)
                perrorx("shmat failed");
	printf("  attached, adr: 0x%lx\n", (long)p);
        bzero(p, shmlen);
	printf("  zeroed\n");

        // if (shmctl(shmid,IPC_RMID,0) == -1)
        //        perrorx("shmctl failed");
	// printf("  deleted\n");
	
	return p;


}

void
launch()
{
        pthread_t                       ptid[128];      /* assumes forkcnt <= 128 */
        int     j;

        for (j=0; j<forkcnt; j++)
                if (pthread_create(&ptid[j], NULL, test, (void*) (long)j) != 0)
                        perrorx("pthread create");

        if(0) for (j=0; j<forkcnt; j++)         /* optional start barrier, disabled */
                while(state[j] == 0);
        go = 1;
        if(0) for (j=0; j<forkcnt; j++)         /* optional completion wait, disabled */
                while(state[j] == 1);
        for (j=0; j<forkcnt; j++)
                pthread_join(ptid[j], NULL);
        exit(0);
}

void*
test(void *arg)
{
        char    *p, *pe;
        long    id;
	int cl;
         
        id = (long) arg;
        state[id] = 1;
        while(!go);                     /* spin until launch() sets go */
        p = malloc(bytes);
        // p = do_shm(bytes);
	if (p == 0) {
	    printf("malloc of %ld bytes failed.\n",bytes);
	    exit(1);
	} else 
	    if (verbose) printf("malloc of %ld bytes succeeded\n",bytes);
        if (do_bzero)
                bzero(p, bytes);
        else {
                for(pe=p+bytes; p<pe; p+=NBPP)  /* touch each page (NBPP = assumed page size) */
			for(cl=0; cl<cachelines;cl++)
                        	p[cl*128] = 'r';
        }
        sleep(sleepsec);
        state[id] = 2;
        pthread_exit(0);
}
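
For reference, the test should build and run on Linux with something like
(assuming the file is saved as pft.c; -lrt is needed for clock_gettime on
older glibc):

	gcc -O2 -o pft pft.c -lpthread -lrt
	./pft -t -g 1 -r 5 -f 8

which faults in 1 GB, five times over, from 8 threads. Note the NBPP page
stride of 16384: on a 4 KB-page system only every fourth page gets touched
unless you adjust it (part of the "tweaking" mentioned above).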