netdev - Re: [Bug 106241] New: shutdown(3)/close(3) behaviour is incorrect for sockets in accept(3)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1446122119.7476.138.camel@edumazet-glaptop2.roam.corp.google.com>
Date:	Thu, 29 Oct 2015 05:35:19 -0700
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Al Viro <viro@...IV.linux.org.uk>
Cc:	David Miller <davem@...emloft.net>, stephen@...workplumber.org,
	netdev@...r.kernel.org,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	dhowells@...hat.com, linux-fsdevel@...r.kernel.org
Subject: Re: [Bug 106241] New: shutdown(3)/close(3) behaviour is incorrect
 for sockets in accept(3)

On Thu, 2015-10-29 at 04:16 +0000, Al Viro wrote:

> Have you tried to experiment with that in userland?  I mean, emulate that
> thing in normal userland code, count the cacheline accesses and drive it
> with the use patterns collected from actual applications.

Sure.

> 
> I can sit down and play with math expectations, but I suspect that it's
> easier to experiment.  It's nothing but an intuition (I hadn't seriously
> done probability theory in quite a while, and my mathematical tastes run
> more to geometry and topology anyway), but... I would expect it to degrade
> badly when the bitmap is reasonably dense.
> 
> Note, BTW, that vmalloc'ed memory gets populated as you read it, and it's
> not cheap - it's done via #PF triggered in kernel mode, with handler
> noticing that the faulting address is in vmalloc range and doing the
> right thing.  IOW, if your bitmap is very sparse, the price of page faults
> needs to be taken into account.

This vmalloc PF is pure noise.
This only matters for the very first allocations.

We target programs opening zillions of fd in their lifetime ;)

Not having to expand a 4,000,000-slot fd array while fully loaded also
removes a latency spike that is very often not desirable.


> 
> AFAICS, the only benefit of that thing is keeping dirtied cachelines far
> from each other.  Which might be a win overall, but I'm not convinced that
> the rest won't offset the effect of that...

Well, I already tested the O_FD_FASTALLOC thing, and I can tell you
find_next_zero_bit() is nowhere to be found in kernel profiles anymore.
It also lowers the time we hold the fd array spinlock while doing fd alloc.

Userland test program I wrote a few months back:

Current kernel :

    64.98%  [kernel]          [k] queued_spin_lock_slowpath    
    14.88%  opensock          [.] memset    // this part simulates user land actual work ;)                   
    11.15%  [kernel]          [k] _find_next_bit.part.0        
     0.69%  [kernel]          [k] _raw_spin_lock               
     0.46%  [kernel]          [k] memset_erms                  
     0.38%  [kernel]          [k] sk_alloc                     
     0.37%  [kernel]          [k] kmem_cache_alloc             
     0.33%  [kernel]          [k] get_empty_filp               
     0.31%  [kernel]          [k] kmem_cache_free              
     0.26%  [kernel]          [k] __alloc_fd                   
     0.26%  opensock          [.] child_function               
     0.18%  [kernel]          [k] inode_init_always            
     0.17%  opensock          [.] __random_r                   


/*
 * test for b/9072743 : fd scaling on gigantic process (with ~ 10,000,000 TCP sockets)
 * - Size fd arrays in kernel to avoid resizings that kill latencies.
 * - Then launch xx threads, each of them doing :
 *    - populate its share of the fd array of the process
 *      (all threads together open 'max' files).
 *    - Loop : close(randomfd()), socket(AF_INET, SOCK_STREAM, 0);
 *
 * Usage : opensock [ -n fds_count ] [ -t threads_count] [ -l duration ] [-f]
 */

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>

/* Number of fds the test works with; set by expand_fd_array(). */
unsigned int count;
/* Extra flags OR-ed into the socket() type argument (SOCK_FD_FASTALLOC with -f). */
int skflags;

/* Upper bound on the number of worker threads (size of tid[]). */
#define NBTHREADS_MAX 4096
/* Handles of the threads created by launch_threads(). */
pthread_t tid[NBTHREADS_MAX];
/* Number of threads actually created so far. */
int nbthreads;
/* Number of threads requested (-t option). */
int nbthreads_req = 24;
/* Set to non-zero by main() to make the worker loops terminate. */
int stop_all;

/* Fallback value, used only when the system headers do not define the flag. */
#ifndef O_FD_FASTALLOC
#define O_FD_FASTALLOC 0x40000000
#endif

/* The socket() variant of the flag defaults to the same bit as O_FD_FASTALLOC. */
#ifndef SOCK_FD_FASTALLOC
#define SOCK_FD_FASTALLOC O_FD_FASTALLOC
#endif

/*
 * Expand the kernel fd array of this process for optimal perf.
 * This could be done by looping on dup(); a single dup2() onto a high
 * descriptor number is used instead.
 *
 * Starting from 'max', the requested size is shrunk (by ~10%, at least by 1)
 * after every failed dup2() until one succeeds. The size being tried is
 * stored in the global 'count'. When SOCK_FD_FASTALLOC is set in 'skflags',
 * 10% of headroom is added on top of that size for the dup2() target.
 *
 * Returns 0 on success, -1 if 'max' is not positive, if the probe socket
 * cannot be created, or if no size could be obtained.
 */
int expand_fd_array(int max)
{
	int target, res, step;
	int fd;

	if (max <= 0) {
		fprintf(stderr, "expand_fd_array: invalid fd count %d\n", max);
		return -1;
	}

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd == -1) {
		perror("socket()");
		return -1;
	}

	for (;;) {
		count = max;
		target = max;
		if (skflags & SOCK_FD_FASTALLOC)
			target += max / 10;

		/* The table already holds slot 'fd', so a target at or below
		 * it needs no expansion; skipping dup2() here also avoids
		 * replacing (and then closing) a descriptor already in use,
		 * such as stdin/stdout/stderr.
		 */
		if (target <= fd) {
			res = fd;
			break;
		}

		res = dup2(fd, target);
		if (res != -1)
			break;

		/* Always shrink by at least one, otherwise the loop would
		 * never terminate once max drops below 10.
		 */
		step = max / 10;
		if (step == 0)
			step = 1;
		max -= step;
		if (max <= 0) {
			perror("dup2()");
			close(fd);
			return -1;
		}
	}

	if (res != fd)
		close(res);
	/* The probe socket is no longer needed. */
	close(fd);

	printf("count=%u (check/increase ulimit -n)\n", count);
	return 0;
}

/* Initial contents of the 32-byte state area used for the random_r()
 * generator of the worker threads (see initstate_r() in child_function()).
 */
static char state[32] = {
	0, 1, 2, 3, 4, 5, 6, 7,
	8, 9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23,
	24, 25, 26, 27, 28, 29, 30, 31
};

/* each thread is using ~400 KB of data per unit of work */
#define WORKING_SET_SIZE 400000

/*
 * Worker thread body.
 *
 * Each thread owns count / nbthreads_req descriptors kept in a private
 * table. It first fills the table (one socket() plus dup() copies), then,
 * until 'stop_all' is set, repeatedly closes a randomly chosen entry,
 * touches WORKING_SET_SIZE bytes of memory to simulate application work,
 * and replaces the entry with a fresh socket.
 *
 * On exit every descriptor still held is closed and the number of
 * completed close/socket iterations is printed on stdout.
 *
 * 'arg' is unused. Always returns NULL.
 */
static void *child_function(void *arg)
{
	unsigned int max = count / nbthreads_req;
	struct random_data buf;
	char rng_state[sizeof(state)];
	int32_t rnd;
	unsigned int idx, i;
	int *tab;
	unsigned long iter = 0;
	unsigned long *work_set;

	(void)arg;

	/* Nothing to do, and 'idx % max' below would divide by zero. */
	if (max == 0)
		return NULL;

	work_set = malloc(WORKING_SET_SIZE);
	if (!work_set)
		return NULL;
	tab = malloc(max * sizeof(*tab));
	if (!tab) {
		free(work_set);
		return NULL;
	}
	/* All bytes 0xff: every slot starts out as -1 ("no descriptor"). */
	memset(tab, 255, max * sizeof(*tab));

	/* random_r() keeps updating the state area, so each thread needs its
	 * own copy instead of sharing the global one. initstate_r() also
	 * requires the random_data structure to be zeroed beforehand.
	 */
	memcpy(rng_state, state, sizeof(rng_state));
	memset(&buf, 0, sizeof(buf));
	initstate_r(getpid(), rng_state, sizeof(rng_state), &buf);

	tab[0] = socket(AF_INET, SOCK_STREAM | skflags, 0);
	if (tab[0] == -1) {
		perror("socket");
		goto out;
	}
	for (i = 1; i < max; i++)
		tab[i] = dup(tab[0]);

	while (!stop_all) {
		random_r(&buf, &rnd);
		idx = (unsigned int)rnd % max;
		close(tab[idx]);

		/* user space needs typically to use a bit of the memory. */
		memset(work_set, idx, WORKING_SET_SIZE);

		tab[idx] = socket(AF_INET, SOCK_STREAM | skflags, 0);
		if (tab[idx] == -1) {
			perror("socket");
			break;
		}
		iter++;
	}
out:
	/* Release every descriptor of the table, not just the last one used. */
	for (i = 0; i < max; i++) {
		if (tab[i] != -1)
			close(tab[i]);
	}
	free(tab);
	free(work_set);
	printf("%lu\n", iter);
	return NULL;
}

/*
 * Start nbthreads_req worker threads running child_function().
 * Thread handles go into tid[]; nbthreads counts the threads that were
 * successfully created. Stops at the first pthread_create() failure and
 * returns its error code, otherwise returns 0.
 */
static int launch_threads(void)
{
	int slot = 0;

	while (slot < nbthreads_req) {
		int rc = pthread_create(&tid[slot], NULL, child_function, NULL);

		if (rc != 0)
			return rc;
		nbthreads++;
		slot++;
	}
	return 0;
}

/* Join every thread recorded in tid[] (the first nbthreads entries). */
static void wait_end(void)
{
	int joined = 0;

	while (joined < nbthreads) {
		pthread_join(tid[joined], NULL);
		joined++;
	}
}

/*
 * Print the command line synopsis on stderr and terminate the process
 * with exit status 'code'. Does not return.
 */
static void usage(int code)
{
	/* -l (run duration, handed to sleep()) is parsed by main() as well. */
	fprintf(stderr,
		"Usage : opensock [ -n fds_count ] [ -t threads_count] [ -l duration ] [-f]\n");
	exit(code);
}

int main(int argc, char *argv[])
{
	int c;
	int max = 1000000;
	int duration = 10;

	while ((c = getopt(argc, argv, "fn:t:l:")) != -1) {
		switch (c) {
		case 'f':
			skflags = SOCK_FD_FASTALLOC;
			break;
		case 'n':
			max = atoi(optarg);
			break;
		case 't':
			nbthreads_req = atoi(optarg);
			if (nbthreads_req > NBTHREADS_MAX)
				usage(1);
			break;
		case 'l':
			duration = atoi(optarg);
			break;
		default:
			usage(1);
		}
	}
	system("sysctl -w fs.file-max=8000000");
	expand_fd_array(max);
	launch_threads();
	sleep(duration);
	stop_all = 1;
	wait_end();
}


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html