lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 22 Apr 2009 20:12:57 -0400
From:	Valerie Aurora Henson <vaurora@...hat.com>
To:	linux-fsdevel@...r.kernel.org
Cc:	linux-kernel@...r.kernel.org, Chris Mason <chris.mason@...cle.com>,
	Theodore Tso <tytso@....edu>,
	Eric Sandeen <sandeen@...hat.com>,
	Ric Wheeler <rwheeler@...hat.com>
Subject: [RFC PATCH] fpathconf() for fsync() behavior

In the default mode for ext3 and btrfs, fsync() is both slow and
unnecessary for some important application use cases - at the same
time that it is absolutely required for correctness for other modes of
ext3, ext4, XFS, etc.  If applications could easily distinguish
between the two cases, they would be more likely to be correct and
fast.

How about an fpathconf() variable, something like _PC_ORDERED?  E.g.:

	/* Unoptimized example optional fsync() demo */
	write(fd);
	/* Only fsync() if we need it */
	if (fpathconf(fd, _PC_ORDERED) != 1)
		fsync(fd);
	rename(tmp_path, new_path);

I know of two specific real-world cases in which this would
significantly improve performance: (a) fsync() before rename(), (b)
fsync() of the parent directory of a newly created file.  Case (b) is
particularly nasty when you have multiple threads creating files in
the same directory because the dir's i_mutex is held across fsync() -
file creates become limited to the speed of sequential fsync()s.

Conceptual libc patch below.

-VAL

diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
index db03529..5b64939 100644
--- a/sysdeps/unix/sysv/linux/pathconf.c
+++ b/sysdeps/unix/sysv/linux/pathconf.c
@@ -51,6 +51,9 @@ __pathconf (const char *file, int name)
     case _PC_CHOWN_RESTRICTED:
       return __statfs_chown_restricted (__statfs (file, &fsbuf), &fsbuf);
 
+    case _PC_ORDERED:
+      return __statfs_ordered (__statfs (file, &fsbuf), &fsbuf);
+
     default:
       return posix_pathconf (file, name);
     }
@@ -225,3 +228,44 @@ __statfs_chown_restricted (int result, const struct statfs *fsbuf)
 
   return retval;
 }
+
+
+/* Tells us if write operations are ordered with respect to each
+ * other.  Useful for skipping fsync in some cases.  Default is 0 -
+ * not ordered. */
+
+/* Used like: return __statfs_ordered (__statfs (name, &buf), &buf); */
+long int
+__statfs_ordered (int result, const struct statfs *fsbuf)
+{
+  if (result < 0)
+    {
+      if (errno == ENOSYS)
+	/* Not possible, return the default value.  */
+	return 0;
+
+      /* Some error occurred.  */
+      return -1;
+    }
+
+#define BTRFS_SUPER_MAGIC       0x9123683E
+  switch (fsbuf->f_type)
+    {
+    case BTRFS_SUPER_MAGIC:
+    case EXT2_SUPER_MAGIC:
+	    /* XXX Must distinguish between 2, 3, and 4 */
+    case REISERFS_SUPER_MAGIC:
+	    /* XXX Nasty hacking needed here to determine exact
+	     * journaling mode.  Options include parsing /proc/mounts,
+	     * defining an ioctl(), creating a generic VFS interface.
+	     * For demonstration purposes, assume the default mode,
+	     * which is ordered for each of these file systems.
+	     */
+	    return 1;
+    case XFS_SUPER_MAGIC:
+	    /* XXX XFS has a trillion options, is there one to do ordered mode? */
+	    return 0;
+    default:
+      return 0;
+    }
+}
diff --git a/bits/confname.h b/bits/confname.h
index 80b51ac..3d19902 100644
--- a/bits/confname.h
+++ b/bits/confname.h
@@ -39,6 +39,8 @@ enum
 #define	_PC_PIPE_BUF			_PC_PIPE_BUF
     _PC_CHOWN_RESTRICTED,
 #define	_PC_CHOWN_RESTRICTED		_PC_CHOWN_RESTRICTED
+    _PC_ORDERED,
+#define	_PC_ORDERED			_PC_ORDERED
     _PC_NO_TRUNC,
 #define	_PC_NO_TRUNC			_PC_NO_TRUNC
     _PC_VDISABLE,
diff --git a/conform/data/unistd.h-data b/conform/data/unistd.h-data
index b6effa0..7325ff5 100644
--- a/conform/data/unistd.h-data
+++ b/conform/data/unistd.h-data
@@ -248,6 +248,7 @@ constant _PC_MAX_CANON
 constant _PC_MAX_INPUT
 constant _PC_NAME_MAX
 constant _PC_NO_TRUNC
+constant _PC_ORDERED
 constant _PC_PATH_MAX
 constant _PC_PIPE_BUF
 constant _PC_PRIO_IO
diff --git a/posix/annexc.c b/posix/annexc.c
index df5913a..658bdc1 100644
--- a/posix/annexc.c
+++ b/posix/annexc.c
@@ -501,7 +501,7 @@ static const char *const unistd_syms[] =
   "F_OK", "NULL", "R_OK", "SEEK_CUR", "SEEK_END", "SEEK_SET", "STDERR_FILENO",
   "STDIN_FILENO", "STDOUT_FILENO", "W_OK", "X_OK",
   "_PC_ASYNC_IO", "_PC_CHOWN_RESTRICTED", "_PC_LINK_MAX", "_PC_MAX_CANON",
-  "_PC_MAX_INPUT", "_PC_NAME_MAX", "_PC_NO_TRUNC", "_PC_PATH_MAX",
+  "_PC_MAX_INPUT", "_PC_NAME_MAX", "_PC_NO_TRUNC", "_PC_PATH_MAX", "_PC_ORDERED",
   "_PC_PIPE_BUF", "_PC_PRIO_IO", "_PC_SYNC_IO", "_PC_VDISABLE",
   "_SC_AIO_LISTIO_MAX", "_SC_AIO_MAX", "_SC_AIO_PRIO_DELTA_MAX",
   "_SC_ARG_MAX", "_SC_ASYNCHRONOUS_IO", "_SC_CHILD_MAX", "_SC_CLK_TCK",
diff --git a/posix/fpathconf.c b/posix/fpathconf.c
index 840460b..d7f9a89 100644
--- a/posix/fpathconf.c
+++ b/posix/fpathconf.c
@@ -47,6 +47,7 @@ __fpathconf (fd, name)
     case _PC_PIPE_BUF:
     case _PC_SOCK_MAXBUF:
     case _PC_CHOWN_RESTRICTED:
+    case _PC_ORDERED:
     case _PC_NO_TRUNC:
     case _PC_VDISABLE:
       break;
diff --git a/posix/getconf.c b/posix/getconf.c
index 6184292..5995d60 100644
--- a/posix/getconf.c
+++ b/posix/getconf.c
@@ -81,6 +81,9 @@ static const struct conf vars[] =
 #ifdef _PC_CHOWN_RESTRICTED
     { "_POSIX_CHOWN_RESTRICTED", _PC_CHOWN_RESTRICTED, PATHCONF },
 #endif
+#ifdef _PC_ORDERED
+    { "_POSIX_ORDERED", _PC_ORDERED, PATHCONF },
+#endif
 #ifdef _PC_NO_TRUNC
     { "_POSIX_NO_TRUNC", _PC_NO_TRUNC, PATHCONF },
 #endif
diff --git a/sysdeps/posix/fpathconf.c b/sysdeps/posix/fpathconf.c
index 605cd17..c29fa6f 100644
--- a/sysdeps/posix/fpathconf.c
+++ b/sysdeps/posix/fpathconf.c
@@ -121,6 +121,13 @@ __fpathconf (fd, name)
       return -1;
 #endif
 
+    case _PC_ORDERED:
+#ifdef	_POSIX_ORDERED
+      return _POSIX_ORDERED;
+#else
+      return -1;
+#endif
+
     case _PC_NO_TRUNC:
 #ifdef	_POSIX_NO_TRUNC
       return _POSIX_NO_TRUNC;
diff --git a/sysdeps/posix/pathconf.c b/sysdeps/posix/pathconf.c
index 75c99ee..f9d84ab 100644
--- a/sysdeps/posix/pathconf.c
+++ b/sysdeps/posix/pathconf.c
@@ -117,6 +117,13 @@ __pathconf (const char *path, int name)
       return -1;
 #endif
 
+    case _PC_ORDERED:
+#ifdef	_POSIX_ORDERED
+    return _POSIX_ORDERED;
+#else
+      return -1;
+#endif
+
     case _PC_NO_TRUNC:
 #ifdef	_POSIX_NO_TRUNC
       return _POSIX_NO_TRUNC;
diff --git a/sysdeps/unix/sysv/linux/fpathconf.c b/sysdeps/unix/sysv/linux/fpathconf.c
index 2701c9e..51c43c4 100644
--- a/sysdeps/unix/sysv/linux/fpathconf.c
+++ b/sysdeps/unix/sysv/linux/fpathconf.c
@@ -48,6 +48,9 @@ __fpathconf (fd, name)
     case _PC_CHOWN_RESTRICTED:
       return __statfs_chown_restricted (__fstatfs (fd, &fsbuf), &fsbuf);
 
+    case _PC_ORDERED:
+      return __statfs_ordered (__fstatfs (fd, &fsbuf), &fsbuf);
+
     default:
       return posix_fpathconf (fd, name);
     }
diff --git a/sysdeps/unix/sysv/linux/pathconf.h b/sysdeps/unix/sysv/linux/pathconf.h
index 806adcc..1c0b513 100644
--- a/sysdeps/unix/sysv/linux/pathconf.h
+++ b/sysdeps/unix/sysv/linux/pathconf.h
@@ -37,3 +37,6 @@ extern long int __statfs_symlinks (int result, const struct statfs *fsbuf);
 /* Used like: return __statfs_chown_restricted (__statfs (name, &buf), &buf);*/
 extern long int __statfs_chown_restricted (int result,
 					   const struct statfs *fsbuf);
+
+/* Used like: return __statfs_ordered (__statfs (name, &buf), &buf); */
+extern long int __statfs_ordered (int result, const struct statfs *fsbuf);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists