lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1446126526.7476.145.camel@edumazet-glaptop2.roam.corp.google.com>
Date:	Thu, 29 Oct 2015 06:48:46 -0700
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Al Viro <viro@...IV.linux.org.uk>
Cc:	David Miller <davem@...emloft.net>, stephen@...workplumber.org,
	netdev@...r.kernel.org,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	dhowells@...hat.com, linux-fsdevel@...r.kernel.org
Subject: Re: [Bug 106241] New: shutdown(3)/close(3) behaviour is incorrect
 for sockets in accept(3)

On Thu, 2015-10-29 at 05:35 -0700, Eric Dumazet wrote:

> Current kernel :
> 
>     64.98%  [kernel]          [k] queued_spin_lock_slowpath    
>     14.88%  opensock          [.] memset    // this part simulates user land actual work ;)                   
>     11.15%  [kernel]          [k] _find_next_bit.part.0        
>      0.69%  [kernel]          [k] _raw_spin_lock               
>      0.46%  [kernel]          [k] memset_erms                  
>      0.38%  [kernel]          [k] sk_alloc                     
>      0.37%  [kernel]          [k] kmem_cache_alloc             
>      0.33%  [kernel]          [k] get_empty_filp               
>      0.31%  [kernel]          [k] kmem_cache_free              
>      0.26%  [kernel]          [k] __alloc_fd                   
>      0.26%  opensock          [.] child_function               
>      0.18%  [kernel]          [k] inode_init_always            
>      0.17%  opensock          [.] __random_r                   

With the attached prototype patch, we get this profile instead:

You can see we no longer hit the spinlock contention
(queued_spin_lock_slowpath) or the cache waste in find_next_bit.

Userland can really progress _much_ faster.

    76.86%  opensock          [.] memset                        
     1.31%  [kernel]          [k] _raw_spin_lock                
     1.15%  assd              [.] 0x000000000056f32c            
     1.08%  [kernel]          [k] kmem_cache_free               
     0.97%  [kernel]          [k] kmem_cache_alloc              
     0.83%  [kernel]          [k] sk_alloc                      
     0.72%  [kernel]          [k] memset_erms                   
     0.70%  opensock          [.] child_function                
     0.67%  [kernel]          [k] get_empty_filp                
     0.65%  [kernel]          [k] __alloc_fd                    
     0.58%  [kernel]          [k] __close_fd                    
     0.49%  [kernel]          [k] queued_spin_lock_slowpath     

diff --git a/fs/file.c b/fs/file.c
index 6c672ad329e9..eabb9a626259 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -22,6 +22,7 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
+#include <linux/random.h>
 
 int sysctl_nr_open __read_mostly = 1024*1024;
 int sysctl_nr_open_min = BITS_PER_LONG;
@@ -471,6 +472,19 @@ int __alloc_fd(struct files_struct *files,
 	spin_lock(&files->file_lock);
 repeat:
 	fdt = files_fdtable(files);
+
+	if (unlikely(flags & O_FD_FASTALLOC)) {
+		u32 rnd, limit = min(end, fdt->max_fds);
+
+		/*
+		 * Note: do not bother with files->next_fd,
+		 * this is for POSIX lovers...
+		 */
+		rnd = ((u64)prandom_u32() * limit) >> 32;
+		fd = find_next_zero_bit(fdt->open_fds, limit, rnd);
+		if (fd < limit)
+			goto ok;
+	}
 	fd = start;
 	if (fd < files->next_fd)
 		fd = files->next_fd;
@@ -499,7 +513,7 @@ repeat:
 
 	if (start <= files->next_fd)
 		files->next_fd = fd + 1;
-
+ok:
 	__set_open_fd(fd, fdt);
 	if (flags & O_CLOEXEC)
 		__set_close_on_exec(fd, fdt);
diff --git a/include/linux/net.h b/include/linux/net.h
index 70ac5e28e6b7..3823d082af4c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -76,6 +76,7 @@ enum sock_type {
 #ifndef SOCK_NONBLOCK
 #define SOCK_NONBLOCK	O_NONBLOCK
 #endif
+#define SOCK_FD_FASTALLOC O_FD_FASTALLOC
 
 #endif /* ARCH_HAS_SOCKET_TYPES */
 
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index e063effe0cc1..badd421dd9f4 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -88,6 +88,10 @@
 #define __O_TMPFILE	020000000
 #endif
 
+#ifndef O_FD_FASTALLOC
+#define O_FD_FASTALLOC 0x40000000
+#endif
+
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
 #define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)      
diff --git a/net/socket.c b/net/socket.c
index 9963a0b53a64..6dde02b2eaf9 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1227,9 +1227,10 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
 	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
 	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
 	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
+	BUILD_BUG_ON(SOCK_FD_FASTALLOC & SOCK_TYPE_MASK);
 
 	flags = type & ~SOCK_TYPE_MASK;
-	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK | SOCK_FD_FASTALLOC))
 		return -EINVAL;
 	type &= SOCK_TYPE_MASK;
 
@@ -1240,7 +1241,7 @@ SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
 	if (retval < 0)
 		goto out;
 
-	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
+	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK | O_FD_FASTALLOC));
 	if (retval < 0)
 		goto out_release;
 
@@ -1266,7 +1267,7 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
 	int flags;
 
 	flags = type & ~SOCK_TYPE_MASK;
-	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK | SOCK_FD_FASTALLOC))
 		return -EINVAL;
 	type &= SOCK_TYPE_MASK;
 
@@ -1436,7 +1437,7 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
 	int err, len, newfd, fput_needed;
 	struct sockaddr_storage address;
 
-	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK | SOCK_FD_FASTALLOC))
 		return -EINVAL;
 
 	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ