Message-ID: <Pine.LNX.4.64.0608291136440.20095@schroedinger.engr.sgi.com>
Date:	Tue, 29 Aug 2006 11:41:28 -0700 (PDT)
From:	Christoph Lameter <clameter@....com>
To:	Andi Kleen <ak@...e.de>
cc:	David Howells <dhowells@...hat.com>,
	Nick Piggin <nickpiggin@...oo.com.au>,
	Arjan van de Ven <arjan@...radead.org>,
	Dong Feng <middle.fengdong@...il.com>,
	Paul Mackerras <paulus@...ba.org>,
	linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org
Subject: Re: Why Semaphore Hardware-Dependent?

On Tue, 29 Aug 2006, Andi Kleen wrote:

> > Correct. However, a cmpxchg may have to acquire that cacheline multiple 
> > times in a highly contended situation.
> 
> Hmm, thinking about it, that sounds unlikely, because only the first and the last
> user should touch the rwsem head. But ok, it might be possible.

Well that sounds encouraging.
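
To illustrate the pattern in question: a cmpxchg update is typically a
retry loop, and every failed attempt pulls the cacheline back in exclusive
state, so one logical update can acquire the line several times under
contention. A minimal userspace sketch (using the GCC __sync builtin as a
stand-in for the kernel's cmpxchg; illustrative only):

static long counter;

void add_via_cmpxchg(long delta)
{
        long old, newval;

        do {
                old = counter;          /* read: line may be in shared state */
                newval = old + delta;
                /* the cmpxchg needs the line exclusive; if another CPU
                   changed counter meanwhile, we loop and pay again */
        } while (__sync_val_compare_and_swap(&counter, old, newval) != old);
}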

> BTW maybe it would be a good idea to switch the wait list to a hlist,
> then the last user in the queue wouldn't need to 
> touch the cache line of the head. Or maybe even a single linked
> list then some more cache bounces might be avoidable.
> 
> That is really why we need a single C implementation. If we had that 
> such optimizations would be pretty easy. Without it it's a big mess.

I am all for optimizations....
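
To make the hlist idea concrete, an illustrative sketch (not the kernel's
actual rwsem structures): with a circular doubly linked wait list, the last
waiter links back to the list head, which sits next to the hot semaphore
count, so unlinking the tail dirties that cacheline. With a NULL-terminated
singly linked list, unlinking a later waiter touches only its predecessor's
node:

struct waiter {
        struct waiter *next;    /* NULL-terminated: no back pointer to the head */
};

/* Unlink w given a pointer to its predecessor's next field; the
   semaphore head is written only when w is the first waiter. */
void dequeue(struct waiter **pprev, struct waiter *w)
{
        *pprev = w->next;
}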
> 
> > We have long tuned that portion of the code and therefore we are 
> > skeptical of changes. But if we cannot measure a difference to a 
> > generic implementation then it would be okay.
> 
> Would you be willing to run numbers comparing them? (or provide a benchmark?) 

Yes. The infamous page fault test (pft.c) really hits this one hard, and on
a large SMP system you should be able to see problems immediately. In fact,
since the page table lock scaling fixes went in, the page fault rate on our
large system has been limited by rwsem performance, due to the mmap_sem
cacheline becoming contended. We'd be glad about any improvements in our
numbers.


Benchmark code follows (you may have to do some tweaking for 32 
bit):

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

#include <sys/mman.h>
#include <sys/wait.h>		/* wait() */
#include <strings.h>		/* bzero() */
#include <time.h>
#include <errno.h>
#include <sys/resource.h>

extern int      optind, opterr;
extern char     *optarg;

long     bytes=16384;
long     sleepsec=0;
long     verbose=0;
long     forkcnt=1;
long     repeatcount=1;
long	cachelines=1;
long     do_bzero=0;
long     mypid;
int 	title=0;

volatile int    go, state[128];

struct timespec wall,wall_end,wall_start;
struct rusage ruse;
long faults;
long pages;
long gbyte;
double faults_per_sec;
double faults_per_sec_per_cpu;

#define perrorx(s)      (perror(s), exit(1))
#define NBPP            16384

void* test(void*);
void  launch(void);


int
main (int argc, char *argv[])
{
        int                     i, c, stat, er=0;
        static  char            optstr[] = "b:c:f:g:r:s:vzHt";


        opterr=1;
        while ((c = getopt(argc, argv, optstr)) != EOF)
                switch (c) {
                case 'g':
                        bytes = atol(optarg)*1024*1024*1024;
                        break;
                case 'b':
                        bytes = atol(optarg);
                        break;
		case 'c':
			cachelines = atol(optarg);
			break;
                case 'f':
                        forkcnt = atol(optarg);
                        break;
                case 'r':
                        repeatcount = atol(optarg);
                        break;
                case 's':
                        sleepsec = atol(optarg);
                        break;
                case 'v':
                        verbose++;
                        break;
                case 'z':
                        do_bzero++;
                        break;
                case 'H':
                        er++;
                        break;
		case 't' :
			title++;
			break;
                case '?':
                        er = 1;
                        break;
                }

        if (er) {
                printf("usage: %s %s\n", argv[0], optstr);
                exit(1);
        }

	pages = bytes*repeatcount/getpagesize();
	gbyte = bytes/(1024*1024*1024);
	bytes = bytes/forkcnt;

	clock_gettime(CLOCK_REALTIME,&wall_start);

	if (verbose) printf("Calculated pages=%ld pagesize=%ld.\n",pages,getpagesize());

        mypid = getpid();
        setpgid(0, mypid);

        for (i=0; i<repeatcount; i++) {
                if (fork() == 0)
                        launch();
                while (wait(&stat) > 0);
        }
	
	getrusage(RUSAGE_CHILDREN,&ruse);
	clock_gettime(CLOCK_REALTIME,&wall_end);
	wall.tv_sec = wall_end.tv_sec - wall_start.tv_sec;
	wall.tv_nsec = wall_end.tv_nsec - wall_start.tv_nsec;
	if (wall.tv_nsec <0 )		{ wall.tv_sec--; wall.tv_nsec += 1000000000; }
	if (wall.tv_nsec >1000000000)	{ wall.tv_sec++; wall.tv_nsec -= 1000000000; }
	if (verbose) printf("Calculated faults=%ld. Real minor faults=%ld, major faults=%ld\n",pages,ruse.ru_minflt+ruse.ru_majflt);
	faults_per_sec=(double) pages / ((double) wall.tv_sec + (double) wall.tv_nsec / 1000000000.0);
	faults_per_sec_per_cpu=(double) pages /  (
		(double) (ruse.ru_utime.tv_sec + ruse.ru_stime.tv_sec) + ((double) (ruse.ru_utime.tv_usec + ruse.ru_stime.tv_usec) / 1000000.0));
	if (title) printf(" Gb Rep Thr CLine  User      System   Wall  flt/cpu/s fault/wsec\n");
	printf("%3ld %2ld %4ld %3ld %4ld.%02lds%7ld.%02lds%4ld.%02lds%10.3f %10.3f\n",
		gbyte,repeatcount,forkcnt,cachelines,
		ruse.ru_utime.tv_sec,ruse.ru_utime.tv_usec/10000,
		ruse.ru_stime.tv_sec,ruse.ru_stime.tv_usec/10000,
		wall.tv_sec,wall.tv_nsec/10000000,
		faults_per_sec_per_cpu,faults_per_sec);
        exit(0);
}

char *
do_shm(long shmlen) {
        char    *p;
        int     shmid;

        printf ("Try to allocate TOTAL shm segment of %ld bytes\n", shmlen);

        if ((shmid = shmget(IPC_PRIVATE, shmlen, SHM_R|SHM_W))  == -1)
                perrorx("shmget failed");

        if ((p=(char*)shmat(shmid, (void*)0, 0)) == (char*)-1)
                perrorx("shmat failed");
	printf("  attached, adr: 0x%lx\n", (long)p);
        bzero(p, shmlen);
	printf("  zeroed\n");

        // if (shmctl(shmid,IPC_RMID,0) == -1)
        //        perrorx("shmctl failed");
	// printf("  deleted\n");
	
	return p;


}

void
launch()
{
        pthread_t                       ptid[128];      /* assumes forkcnt <= 128 */
        int     j;

        for (j=0; j<forkcnt; j++)
                if (pthread_create(&ptid[j], NULL, test, (void*) (long)j) != 0)
                        perrorx("pthread create");

        if(0) for (j=0; j<forkcnt; j++)         /* optional start barrier, disabled */
                while(state[j] == 0);
        go = 1;
        if(0) for (j=0; j<forkcnt; j++)         /* optional completion wait, disabled */
                while(state[j] == 1);
        for (j=0; j<forkcnt; j++)
                pthread_join(ptid[j], NULL);
        exit(0);
}

void*
test(void *arg)
{
        char    *p, *pe;
        long    id;
	int cl;
         
        id = (long) arg;
        state[id] = 1;
        while(!go);                     /* spin until launch() sets go */
        p = malloc(bytes);
        // p = do_shm(bytes);
	if (p == 0) {
	    printf("malloc of %ld bytes failed.\n",bytes);
	    exit(1);
	} else 
	    if (verbose) printf("malloc of %ld bytes succeeded\n",bytes);
        if (do_bzero)
                bzero(p, bytes);
        else {
                for(pe=p+bytes; p<pe; p+=NBPP)  /* touch each page (NBPP = assumed page size) */
			for(cl=0; cl<cachelines;cl++)
                        	p[cl*128] = 'r';
        }
        sleep(sleepsec);
        state[id] = 2;
        pthread_exit(0);
}
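
For reference, the test should build and run on Linux with something like
(assuming the file is saved as pft.c; -lrt is needed for clock_gettime on
older glibc):

	gcc -O2 -o pft pft.c -lpthread -lrt
	./pft -t -g 1 -r 5 -f 8

which faults in 1 GB, five times over, from 8 threads. Note the NBPP page
stride of 16384: on a 4 KB-page system only every fourth page gets touched
unless you adjust it (part of the "tweaking" mentioned above).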