[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <fcd0f413-0ae9-db25-0e0d-8d48e24f3ce6@collabora.com>
Date: Fri, 19 Mar 2021 15:56:59 +0530
From: Shreeya Patel <shreeya.patel@...labora.com>
To: Gabriel Krisman Bertazi <krisman@...labora.com>
Cc: jaegeuk@...nel.org, yuchao0@...wei.com, tytso@....edu,
adilger.kernel@...ger.ca, drosen@...gle.com, ebiggers@...gle.com,
linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
kernel@...labora.com, andre.almeida@...labora.com
Subject: Re: [PATCH v2 4/4] fs: unicode: Add utf8 module and a unicode layer
On 19/03/21 1:27 am, Gabriel Krisman Bertazi wrote:
> Shreeya,
>
Hi Gabriel,
Thanks for your detailed review. Please find my comments inline.
>> utf8data.h_shipped has a large database table which is an auto-generated
>> decodification trie for the unicode normalization functions.
>> It is not necessary to carry this large table in the kernel hence make
>>
> Thanks for the v2. This is looking much better! I have a few comments
> inline.
>
> I also have a question about how this patchset was tested. Did you run
> the normalization test (UNICODE_NORMALIZATION_SELFTEST)? What about
> actual filesystems?
Yes I did run the normalization test by loading the utf8-selftest.ko module.
modprobe utf8-selftest
For actual filesystem check I followed the blog post written by you.
https://www.collabora.com/news-and-blog/blog/2020/08/27/using-the-linux-kernel-case-insensitive-feature-in-ext4/
So the basic steps that were followed are as follows :-
1. mkfs -t ext4 -O casefold image.img
2. mount image.img /mnt ( This step would load the module )
3. cd /mnt
4. mkdir CI_dir
5. touch CI_dir/hello_world ( this step would call the inline functions )
6. ls CI_dir/HELLO_WORLD (HELLO_WORLD file should be found if our
case-insensitive fs is working as expected.)
>
> Minor nit:
>
> "It is not necessary to load this large table in the kernel in the
> kernel if no file system is using it"
>
>> UTF-8 encoding loadable by converting it into a module.
>> Also, modify the file called unicode-core which will act as a layer for
>> unicode subsystem. It will load the UTF-8 module and access it's functions
>> whenever any filesystem that needs unicode is mounted.
>>
>> Signed-off-by: Shreeya Patel <shreeya.patel@...labora.com>
>> ---
>>
>> Changes in v2
>> - Remove the duplicate file utf8-core.c
>> - Make the wrapper functions inline.
>> - Remove msleep and use try_module_get() and module_put()
>> for ensuring that module is loaded correctly and also
>> doesn't get unloaded while in use.
>> fs/unicode/Kconfig | 11 +-
>> fs/unicode/Makefile | 5 +-
>> fs/unicode/unicode-core.c | 229 +++++------------------------------
>> fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
>> include/linux/unicode.h | 73 ++++++++---
>> 5 files changed, 346 insertions(+), 218 deletions(-)
>> create mode 100644 fs/unicode/utf8mod.c
>>
>> diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
>> index 2c27b9a5cd6c..2961b0206b4d 100644
>> --- a/fs/unicode/Kconfig
>> +++ b/fs/unicode/Kconfig
>> @@ -8,7 +8,16 @@ config UNICODE
>> Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
>> support.
>>
>> +# UTF-8 encoding can be compiled as a module using UNICODE_UTF8 option.
>> +# Having UTF-8 encoding as a module will avoid carrying large
>> +# database table present in utf8data.h_shipped into the kernel
>> +# by being able to load it only when it is required by the filesystem.
>> +config UNICODE_UTF8
>> + tristate "UTF-8 module"
>> + depends on UNICODE
>> + default m
>> +
>> config UNICODE_NORMALIZATION_SELFTEST
>> tristate "Test UTF-8 normalization support"
>> - depends on UNICODE
>> + depends on UNICODE_UTF8
>> default n
>> diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
>> index fbf9a629ed0d..9dbb04194b32 100644
>> --- a/fs/unicode/Makefile
>> +++ b/fs/unicode/Makefile
>> @@ -1,11 +1,14 @@
>> # SPDX-License-Identifier: GPL-2.0
>>
>> obj-$(CONFIG_UNICODE) += unicode.o
>> +obj-$(CONFIG_UNICODE_UTF8) += utf8.o
>> obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
>>
>> -unicode-y := utf8-norm.o unicode-core.o
>> +unicode-y := unicode-core.o
>> +utf8-y := utf8mod.o utf8-norm.o
>>
>> $(obj)/utf8-norm.o: $(obj)/utf8data.h
>> +$(obj)/utf8mod.o: $(obj)/utf8-norm.o
>>
>> # In the normal build, the checked-in utf8data.h is just shipped.
>> #
>> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
>> index 287a8a48836c..945984a3fe9e 100644
>> --- a/fs/unicode/unicode-core.c
>> +++ b/fs/unicode/unicode-core.c
>> @@ -1,235 +1,60 @@
>> /* SPDX-License-Identifier: GPL-2.0 */
>> #include <linux/module.h>
>> #include <linux/kernel.h>
>> -#include <linux/string.h>
>> #include <linux/slab.h>
>> -#include <linux/parser.h>
>> #include <linux/errno.h>
>> #include <linux/unicode.h>
>> -#include <linux/stringhash.h>
>>
>> -#include "utf8n.h"
>>
>> -int unicode_validate(const struct unicode_map *um, const struct qstr *str)
>> -{
>> - const struct utf8data *data = utf8nfdi(um->version);
>> -
>> - if (utf8nlen(data, str->name, str->len) < 0)
>> - return -1;
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_validate);
>> -
>> -int unicode_strncmp(const struct unicode_map *um,
>> - const struct qstr *s1, const struct qstr *s2)
>> -{
>> - const struct utf8data *data = utf8nfdi(um->version);
>> - struct utf8cursor cur1, cur2;
>> - int c1, c2;
>> -
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = utf8byte(&cur2);
>> -
>> - if (c1 < 0 || c2 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_strncmp);
>> -
>> -int unicode_strncasecmp(const struct unicode_map *um,
>> - const struct qstr *s1, const struct qstr *s2)
>> -{
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur1, cur2;
>> - int c1, c2;
>> +struct unicode_ops *utf8_ops;
>> +EXPORT_SYMBOL(utf8_ops);
>>
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = utf8byte(&cur2);
>> -
>> - if (c1 < 0 || c2 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_strncasecmp);
>> -
>> -/* String cf is expected to be a valid UTF-8 casefolded
>> - * string.
>> - */
>> -int unicode_strncasecmp_folded(const struct unicode_map *um,
>> - const struct qstr *cf,
>> - const struct qstr *s1)
>> +static int unicode_load_module(void)
>> {
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur1;
>> - int c1, c2;
>> - int i = 0;
>> -
>> - if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>> - return -EINVAL;
>> -
>> - do {
>> - c1 = utf8byte(&cur1);
>> - c2 = cf->name[i++];
>> - if (c1 < 0)
>> - return -EINVAL;
>> - if (c1 != c2)
>> - return 1;
>> - } while (c1);
>> -
>> - return 0;
>> -}
>> -EXPORT_SYMBOL(unicode_strncasecmp_folded);
>> -
>> -int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
>> - unsigned char *dest, size_t dlen)
>> -{
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur;
>> - size_t nlen = 0;
>> -
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> + int ret = request_module("utf8");
>>
>> - for (nlen = 0; nlen < dlen; nlen++) {
>> - int c = utf8byte(&cur);
>> -
>> - dest[nlen] = c;
>> - if (!c)
>> - return nlen;
>> - if (c == -1)
>> - break;
>> + if (ret) {
>> + pr_err("Failed to load UTF-8 module\n");
>> + return ret;
>> }
>> - return -EINVAL;
>> -}
>> -EXPORT_SYMBOL(unicode_casefold);
>> -
>> -int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>> - struct qstr *str)
>> -{
>> - const struct utf8data *data = utf8nfdicf(um->version);
>> - struct utf8cursor cur;
>> - int c;
>> - unsigned long hash = init_name_hash(salt);
>>
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> -
>> - while ((c = utf8byte(&cur))) {
>> - if (c < 0)
>> - return -EINVAL;
>> - hash = partial_name_hash((unsigned char)c, hash);
>> - }
>> - str->hash = end_name_hash(hash);
>> return 0;
>> }
>> -EXPORT_SYMBOL(unicode_casefold_hash);
>>
>> -int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
>> - unsigned char *dest, size_t dlen)
>> +struct unicode_map *unicode_load(const char *version)
>> {
>> - const struct utf8data *data = utf8nfdi(um->version);
>> - struct utf8cursor cur;
>> - ssize_t nlen = 0;
>> + int ret = unicode_load_module();
>>
>> - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>> - return -EINVAL;
>> + if (ret)
>> + return ERR_PTR(ret);
>>
>> - for (nlen = 0; nlen < dlen; nlen++) {
>> - int c = utf8byte(&cur);
>> + if (utf8_ops && !try_module_get(utf8_ops->owner))
>> + return ERR_PTR(-ENODEV);
>>
>> - dest[nlen] = c;
>> - if (!c)
>> - return nlen;
>> - if (c == -1)
>> - break;
>> - }
>> - return -EINVAL;
>> + else
>> + return utf8_ops->load(version);
> This patch is a bit hard to review with the way git separated the
> hunks. I think you should be able to regenerate it in a simpler form
> by tinkering with git-format-patch parameters.
I agree, I will make sure to format it nicely for v3.
>
> Here, it seems that if utf8_ops is NULL (the first condition of the if
> leg), the else leg dereferences a NULL pointer by calling
> utf8_ops->load(), which can't be right.
Ah, right.
> Maybe, the if leg should be:
>
> if (!utf8_ops || !try_module_get(utf8_ops->owner)
> return ERR_PTR(-ENODEV)
>
> But this is still racy, since you are not protecting utf8_ops before
> acquiring the reference. If you race with module removal here, a
> NULL ptr dereference can still occur. See below.
If module is removed before reaching this step, then unicode_unregister
function would make utf8_ops NULL. So the first condition of if will be true
and it will return error so how can we have a NULL ptr dereference then?
>> }
>> -EXPORT_SYMBOL(unicode_normalize);
>> +EXPORT_SYMBOL(unicode_load);
>>
>> -static int unicode_parse_version(const char *version, unsigned int *maj,
>> - unsigned int *min, unsigned int *rev)
>> +void unicode_unload(struct unicode_map *um)
>> {
>> - substring_t args[3];
>> - char version_string[12];
>> - static const struct match_token token[] = {
>> - {1, "%d.%d.%d"},
>> - {0, NULL}
>> - };
>> -
>> - strscpy(version_string, version, sizeof(version_string));
>> -
>> - if (match_token(version_string, token, args) != 1)
>> - return -EINVAL;
>> + if (utf8_ops)
>> + module_put(utf8_ops->owner);
>>
> How can we have a unicode_map to free if utf8_ops is NULL? that seems
> to be an invalid use of API, which suggests a bug elsewhere
> in the kernel. maybe this should read like this:
>
> void unicode_unload(struct unicode_map *um)
> {
> if (WARN_ON(!utf8_ops))
> return;
>
> module_put(utf8_ops->owner);
> kfree(um);
> }
The reason for adding the check if(utf8_ops) is that some of the filesystem
calls the unicode_unload function even before calling the unicode_load
function.
if we try to decrement the reference without even having the reference.
( i.e. not loading the module )
it would result in kernel panic.
fs/ext4/super.c
fs/f2fs/super.c
Both the above files call the unicode_unload function if CONFIG_UNICODE
is enabled.
Not sure if this is an odd behavior or expected.
>> - if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>> - match_int(&args[2], rev))
>> - return -EINVAL;
>> -
>> - return 0;
>> + kfree(um);
>> }
>> +EXPORT_SYMBOL(unicode_unload);
>>
>> -struct unicode_map *unicode_load(const char *version)
>> +void unicode_register(struct unicode_ops *ops)
>> {
>> - struct unicode_map *um = NULL;
>> - int unicode_version;
>> -
>> - if (version) {
>> - unsigned int maj, min, rev;
>> -
>> - if (unicode_parse_version(version, &maj, &min, &rev) < 0)
>> - return ERR_PTR(-EINVAL);
>> -
>> - if (!utf8version_is_supported(maj, min, rev))
>> - return ERR_PTR(-EINVAL);
>> -
>> - unicode_version = UNICODE_AGE(maj, min, rev);
>> - } else {
>> - unicode_version = utf8version_latest();
>> - printk(KERN_WARNING"UTF-8 version not specified. "
>> - "Assuming latest supported version (%d.%d.%d).",
>> - (unicode_version >> 16) & 0xff,
>> - (unicode_version >> 8) & 0xff,
>> - (unicode_version & 0xff));
>> - }
>> -
>> - um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>> - if (!um)
>> - return ERR_PTR(-ENOMEM);
>> -
>> - um->charset = "UTF-8";
>> - um->version = unicode_version;
>> -
>> - return um;
>> + utf8_ops = ops;
> While we guarantee that utf8_ops isn't modified when a unicode_map is
> in-use by acquiring the module reference, registering/unregistering the
> module should be protected, to avoid that race I mentioned above.
>
>> }
>> -EXPORT_SYMBOL(unicode_load);
>> +EXPORT_SYMBOL(unicode_register);
>>
>> -void unicode_unload(struct unicode_map *um)
>> +void unicode_unregister(void)
>> {
>> - kfree(um);
>> + utf8_ops = NULL;
>> }
>> -EXPORT_SYMBOL(unicode_unload);
>> +EXPORT_SYMBOL(unicode_unregister);
>>
>> MODULE_LICENSE("GPL v2");
>> diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
>> new file mode 100644
>> index 000000000000..9981960da863
>> --- /dev/null
>> +++ b/fs/unicode/utf8mod.c
> This is a bit nitpicky, but can you follow the naming scheme and call
> this file unicode-utf8.c or maybe utf8-mod.c ?
Sure, will name it as uncide-utf8.c
>
Powered by blists - more mailing lists