linux-ext4 - [PATCH v7 04/10] unicode: Implement higher level API for string handling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20190413192533.2549-5-krisman@collabora.com>
Date:   Sat, 13 Apr 2019 15:25:27 -0400
From:   Gabriel Krisman Bertazi <krisman@...labora.com>
To:     tytso@....edu
Cc:     linux-ext4@...r.kernel.org,
        Gabriel Krisman Bertazi <krisman@...labora.co.uk>
Subject: [PATCH v7 04/10] unicode: Implement higher level API for string handling

From: Gabriel Krisman Bertazi <krisman@...labora.co.uk>

This patch integrates the utf8n patches with some higher level API to
perform UTF-8 string comparison, normalization and casefolding
operations.  Implemented is a variation of NFD, and casefold is
performed by doing full casefold on top of NFD.  These algorithms are
based on the core implemented by Olaf Weber from SGI.

Changes since v6:
  - Fix sparse warnings (0-day)

Changes since v4:
  - integrate in fs/unicode

Changes since RFC v1:
  - Change error return code from EIO to EINVAL. (Olaf Weber)
  - Fix issues with strncmp/strcmp.  (Olaf Weber)
  - Remove stack buffer in normalization/casefold. (Olaf Weber)
  - Include length parameter for second string on comparison functions.
  - Change length type to size_t.

Signed-off-by: Gabriel Krisman Bertazi <krisman@...labora.co.uk>
---
 fs/unicode/Makefile     |   4 +-
 fs/unicode/utf8-core.c  | 187 ++++++++++++++++++++++++++++++++++++++++
 fs/unicode/utf8-norm.c  |   6 ++
 fs/unicode/utf8n.h      |   1 +
 include/linux/unicode.h |  30 +++++++
 5 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 fs/unicode/utf8-core.c
 create mode 100644 include/linux/unicode.h

diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 16d43d180416..bfb0360687df 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-$(CONFIG_UNICODE) += utf8-norm.o
+obj-$(CONFIG_UNICODE) += unicode.o
+
+unicode-y := utf8-norm.o utf8-core.o
 
 # This rule is not invoked during the kernel compilation.  It is used to
 # regenerate the utf8data.h header file.
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..6afab4fdce90
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+
+#include "utf8n.h"
+
+int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+	const struct utf8data *data = utf8nfdi(um->version);
+
+	if (utf8nlen(data, str->name, str->len) < 0)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL(utf8_validate);
+
+int utf8_strncmp(const struct unicode_map *um,
+		 const struct qstr *s1, const struct qstr *s2)
+{
+	const struct utf8data *data = utf8nfdi(um->version);
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+		return -EINVAL;
+
+	if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+EXPORT_SYMBOL(utf8_strncmp);
+
+int utf8_strncasecmp(const struct unicode_map *um,
+		     const struct qstr *s1, const struct qstr *s2)
+{
+	const struct utf8data *data = utf8nfdicf(um->version);
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+		return -EINVAL;
+
+	if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+EXPORT_SYMBOL(utf8_strncasecmp);
+
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+		  unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfdicf(um->version);
+	struct utf8cursor cur;
+	size_t nlen = 0;
+
+	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+		return -EINVAL;
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		int c = utf8byte(&cur);
+
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+		if (c == -1)
+			break;
+	}
+	return -EINVAL;
+}
+
+EXPORT_SYMBOL(utf8_casefold);
+
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+		   unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfdi(um->version);
+	struct utf8cursor cur;
+	ssize_t nlen = 0;
+
+	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+		return -EINVAL;
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		int c = utf8byte(&cur);
+
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+		if (c == -1)
+			break;
+	}
+	return -EINVAL;
+}
+
+EXPORT_SYMBOL(utf8_normalize);
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	substring_t args[3];
+	char version_string[12];
+	const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+
+	strncpy(version_string, version, sizeof(version_string));
+
+	if (match_token(version_string, token, args) != 1)
+		return -EINVAL;
+
+	if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+	    match_int(&args[2], rev))
+		return -EINVAL;
+
+	return 0;
+}
+
+struct unicode_map *utf8_load(const char *version)
+{
+	struct unicode_map *um = NULL;
+	int unicode_version;
+
+	if (version) {
+		unsigned int maj, min, rev;
+
+		if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+			return ERR_PTR(-EINVAL);
+
+		if (!utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+
+		unicode_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		unicode_version = utf8version_latest();
+		printk(KERN_WARNING"UTF-8 version not specified. "
+		       "Assuming latest supported version (%d.%d.%d).",
+		       (unicode_version >> 16) & 0xff,
+		       (unicode_version >> 8) & 0xff,
+		       (unicode_version & 0xff));
+	}
+
+	um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
+	if (!um)
+		return ERR_PTR(-ENOMEM);
+
+	um->charset = "UTF-8";
+	um->version = unicode_version;
+
+	return um;
+}
+EXPORT_SYMBOL(utf8_load);
+
+void utf8_unload(struct unicode_map *um)
+{
+	kfree(um);
+}
+EXPORT_SYMBOL(utf8_unload);
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 848b93e97f50..20d440c3f2db 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 }
 EXPORT_SYMBOL(utf8version_is_supported);
 
+int utf8version_latest(void)
+{
+	return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
+
 /*
  * UTF-8 valid ranges.
  *
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index b63a9091dc39..a120638014c1 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -32,6 +32,7 @@
 
 /* Highest unicode version supported by the data tables. */
 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
 
 /*
  * Look for the correct const struct utf8data for a unicode version.
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
new file mode 100644
index 000000000000..aec2c6d800aa
--- /dev/null
+++ b/include/linux/unicode.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNICODE_H
+#define _LINUX_UNICODE_H
+
+#include <linux/init.h>
+#include <linux/dcache.h>
+
+struct unicode_map {
+	const char *charset;
+	int version;
+};
+
+int utf8_validate(const struct unicode_map *um, const struct qstr *str);
+
+int utf8_strncmp(const struct unicode_map *um,
+		 const struct qstr *s1, const struct qstr *s2);
+
+int utf8_strncasecmp(const struct unicode_map *um,
+		 const struct qstr *s1, const struct qstr *s2);
+
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+		   unsigned char *dest, size_t dlen);
+
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+		  unsigned char *dest, size_t dlen);
+
+struct unicode_map *utf8_load(const char *version);
+void utf8_unload(struct unicode_map *um);
+
+#endif /* _LINUX_UNICODE_H */
-- 
2.20.1